thejagstudio committed
Commit 486838c · verified · 1 Parent(s): 3f3700b

Upload 10 files

Files changed (10)
  1. Dockerfile +7 -0
  2. app.py +475 -0
  3. checkpoints/model_fp32.pt +3 -0
  4. design.json +185 -0
  5. infer-base.py +778 -0
  6. infer-chat.py +656 -0
  7. requirements.txt +5 -0
  8. static/ai.mp4 +0 -0
  9. static/index.html +156 -0
  10. static/main.js +346 -0
Dockerfile ADDED
@@ -0,0 +1,7 @@
FROM python:3
WORKDIR /usr/src/app
COPY requirements.txt ./
RUN pip install -r requirements.txt
COPY . .
EXPOSE 7860
CMD ["python", "./app.py"]
app.py ADDED
@@ -0,0 +1,475 @@
import os
import sys
import json
import time
import importlib.util
from pathlib import Path
from flask import Flask, request, jsonify, Response, stream_with_context
from flask_cors import CORS
import torch
from transformers import AutoTokenizer

app = Flask(__name__, static_folder='static', static_url_path='/static')
CORS(app)

# Global state
model = None
tokenizer = None
config = None
device = None
DiffusionLLM = None
chat_function = None


def find_file(filename, search_dirs=None):
    """Find a file in current directory or parent directories."""
    if search_dirs is None:
        search_dirs = [
            os.path.dirname(__file__),  # Current directory
            os.path.dirname(os.path.dirname(__file__)),  # Parent directory
            os.getcwd(),  # Working directory
        ]

    for directory in search_dirs:
        filepath = os.path.join(directory, filename)
        if os.path.exists(filepath):
            print(f"Found {filename} at: {filepath}")
            return filepath

    return None


def try_import_module(filepath, module_name):
    """Dynamically import a Python file as a module."""
    if not filepath or not os.path.exists(filepath):
        return None

    try:
        # Add the directory to sys.path
        module_dir = os.path.dirname(filepath)
        if module_dir not in sys.path:
            sys.path.insert(0, module_dir)

        spec = importlib.util.spec_from_file_location(module_name, filepath)
        if spec is None:
            print(f"Could not create spec for {filepath}")
            return None

        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)

        print(f"Successfully imported {module_name} from {filepath}")
        return module
    except Exception as e:
        print(f"Error importing {filepath}: {e}")
        import traceback
        traceback.print_exc()
        return None


def load_model_internal():
    """Load the model and tokenizer."""
    global model, tokenizer, config, device, DiffusionLLM, chat_function

    if model is not None:
        return True

    try:
        print("=" * 60)
        print("Starting model loading process...")
        print("=" * 60)

        # Find and import infer-base.py
        base_path = find_file("infer-base.py")
        if base_path is None:
            raise RuntimeError("Could not find infer-base.py. Make sure it's in the same directory as app.py or parent directory.")

        print(f"\nImporting infer-base.py from: {base_path}")
        base_mod = try_import_module(base_path, "infer_base")

        if base_mod is None:
            raise RuntimeError("Failed to import infer-base.py")

        # Check for DiffusionLLM class
        if not hasattr(base_mod, 'DiffusionLLM'):
            print("Available attributes in infer_base:", dir(base_mod))
            raise RuntimeError("DiffusionLLM class not found in infer-base.py")

        DiffusionLLM = base_mod.DiffusionLLM
        print("✓ Successfully loaded DiffusionLLM class")

        # Find and import infer-chat.py
        chat_path = find_file("infer-chat.py")
        if chat_path is None:
            raise RuntimeError("Could not find infer-chat.py")

        print(f"\nImporting infer-chat.py from: {chat_path}")
        chat_mod = try_import_module(chat_path, "infer_chat")

        if chat_mod is None or not hasattr(chat_mod, 'chat'):
            raise RuntimeError("Failed to import chat function from infer-chat.py")

        chat_function = chat_mod.chat
        print("✓ Successfully loaded chat function")

        # Setup pickling workaround for torch.load
        try:
            if hasattr(base_mod, 'ModelConfig'):
                sys.modules['__main__'].ModelConfig = base_mod.ModelConfig
            sys.modules['__main__'].DiffusionLLM = DiffusionLLM
            print("✓ Configured pickle support for model loading")
        except Exception as e:
            print(f"Warning: Could not setup pickle workaround: {e}")

        # Set device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"\n✓ Using device: {device}")

        # Load tokenizer
        print("\nLoading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("✓ Tokenizer loaded")

        # Find model checkpoint
        checkpoint_dirs = [
            "checkpoints",
            "../checkpoints",
            "./checkpoints",
            os.path.join(os.path.dirname(__file__), "checkpoints"),
            os.path.join(os.path.dirname(__file__), "../checkpoints"),
        ]

        model_path = None
        for checkpoint_dir in checkpoint_dirs:
            best_path = os.path.join(checkpoint_dir, "best_model.pt")
            fp32_path = os.path.join(checkpoint_dir, "model_fp32.pt")

            if os.path.exists(best_path):
                model_path = best_path
                break
            elif os.path.exists(fp32_path):
                model_path = fp32_path
                break

        if model_path is None:
            raise RuntimeError(
                "Could not find model checkpoint. Looking for:\n"
                "  - checkpoints/best_model.pt\n"
                "  - checkpoints/model_fp32.pt\n"
                f"Searched directories: {checkpoint_dirs}"
            )

        print(f"\n✓ Found model checkpoint: {model_path}")
        print("Loading model weights (this may take a minute)...")

        # Load model
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
        config = checkpoint['config']

        print("Creating model...")
        model = DiffusionLLM(config)

        print("Loading state dict...")
        state_dict = checkpoint['model_state']
        state_dict = {k: v.float() for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

        model = model.to(device)
        model.eval()

        num_params = sum(p.numel() for p in model.parameters()) / 1e6
        print(f"\n{'=' * 60}")
        print("✓✓✓ MODEL LOADED SUCCESSFULLY ✓✓✓")
        print(f"{'=' * 60}")
        print(f"Parameters: {num_params:.1f}M")
        if 'step' in checkpoint:
            print(f"Training steps: {checkpoint['step']}")
        if 'best_val_loss' in checkpoint:
            print(f"Best validation loss: {checkpoint['best_val_loss']:.4f}")
        print(f"{'=' * 60}\n")

        return True

    except Exception as e:
        print("\n" + "=" * 60)
        print("ERROR LOADING MODEL")
        print("=" * 60)
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        print("=" * 60 + "\n")
        return False


def create_streaming_visualizer():
    """Create a visualizer that yields SSE events instead of printing to terminal."""
    def visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear=True):
        # Normalize inputs to lists
        if not isinstance(mask_blocks, list):
            mask_blocks = [mask_blocks]
            is_masked_list = [is_masked_list]

        # Decode context
        try:
            context_text = tok.decode(context_ids[0], skip_special_tokens=True).replace('\n', ' ')
        except Exception:
            context_text = str(context_ids[0].tolist())

        # Build blocks visualization
        all_blocks = []
        for block_idx, (mask_block, is_masked) in enumerate(zip(mask_blocks, is_masked_list)):
            block_tokens = mask_block[0].tolist()
            block_data = []

            for i, token_id in enumerate(block_tokens):
                if is_masked[0, i]:
                    block_data.append({
                        'type': 'masked',
                        'text': '███'
                    })
                else:
                    try:
                        token_text = tok.decode([token_id], skip_special_tokens=False)
                    except Exception:
                        token_text = str(int(token_id))
                    block_data.append({
                        'type': 'revealed',
                        'text': token_text
                    })

            all_blocks.append({
                'block_index': block_idx,
                'tokens': block_data
            })

        # Return data structure that will be sent as SSE
        return {
            'context': context_text,
            'blocks': all_blocks,
            'num_blocks': len(mask_blocks)
        }

    return visualizer


@app.route('/')
def index():
    """Serve the main HTML page."""
    return app.send_static_file('index.html')


@app.route('/api/load', methods=['POST'])
def load_model_endpoint():
    """Load the model."""
    data = request.json or {}
    check_only = data.get('check_only', False)

    global model

    if check_only:
        return jsonify({
            'loaded': model is not None,
            'message': 'Model is loaded' if model is not None else 'Model not loaded'
        })

    if model is not None:
        return jsonify({
            'loaded': True,
            'message': 'Model already loaded'
        })

    success = load_model_internal()

    if success:
        return jsonify({
            'loaded': True,
            'message': 'Model loaded successfully'
        })
    else:
        return jsonify({
            'loaded': False,
            'message': 'Failed to load model. Check server logs for details.'
        }), 500


@app.route('/api/generate', methods=['POST'])
def generate():
    """Generate response without streaming."""
    global model, tokenizer, config, device, chat_function

    if model is None:
        return jsonify({'error': 'Model not loaded'}), 400

    if chat_function is None:
        return jsonify({'error': 'Chat function not available'}), 400

    data = request.json
    instruction = data.get('instruction', '')
    steps = data.get('steps', 64)
    block_size = data.get('block_size', 128)
    max_new_tokens = data.get('max_new_tokens', 128)
    parallel_blocks = data.get('parallel_blocks', 1)

    if not instruction:
        return jsonify({'error': 'No instruction provided'}), 400

    try:
        # Generate response
        raw_output, response = chat_function(
            model,
            tokenizer,
            instruction,
            steps=steps,
            block_size=block_size,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            verbose=False,
            visualize_fn=None,
            parallel_blocks=parallel_blocks,
        )

        return jsonify({
            'response': response,
            'raw_output': raw_output
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
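
(Aside, not part of app.py: a minimal smoke test for the two endpoints above. It assumes the server is reachable on port 7860 and that the `requests` package is installed; the payload keys mirror the route handlers.)

# Sketch: load the model once, then call the non-streaming endpoint (assumed host/port).
import requests

HOST = "http://localhost:7860"

requests.post(f"{HOST}/api/load", json={}, timeout=600).raise_for_status()

resp = requests.post(
    f"{HOST}/api/generate",
    json={"instruction": "Write a haiku about diffusion.",
          "steps": 64, "block_size": 128,
          "max_new_tokens": 128, "parallel_blocks": 1},
    timeout=600,  # generation can be slow on CPU
)
resp.raise_for_status()
print(resp.json()["response"])
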


@app.route('/api/generate-stream', methods=['POST'])
def generate_stream():
    """Generate response with streaming visualization."""
    global model, tokenizer, config, device, chat_function

    if model is None:
        return jsonify({'error': 'Model not loaded'}), 400

    if chat_function is None:
        return jsonify({'error': 'Chat function not available'}), 400

    data = request.json
    instruction = data.get('instruction', '')
    steps = data.get('steps', 64)
    block_size = data.get('block_size', 128)
    max_new_tokens = data.get('max_new_tokens', 128)
    parallel_blocks = data.get('parallel_blocks', 1)

    if not instruction:
        return jsonify({'error': 'No instruction provided'}), 400

    def generate_events():
        try:
            # A queue lets the visualizer callback hand events back to this generator
            import queue
            event_queue = queue.Queue()
            generation_complete = {'done': False, 'result': None}

            def streaming_visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear=True):
                """Called during generation - we need to send events immediately."""
                visualizer = create_streaming_visualizer()
                data = visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear)
                # Put the update in the queue so it can be yielded immediately
                event_queue.put({'type': 'update', 'data': data})

            # Start generation in a separate thread so we can yield events as they come
            import threading

            def run_generation():
                try:
                    raw_output, response = chat_function(
                        model,
                        tokenizer,
                        instruction,
                        steps=steps,
                        block_size=block_size,
                        max_new_tokens=max_new_tokens,
                        temperature=0.8,
                        top_k=50,
                        top_p=0.9,
                        repetition_penalty=1.2,
                        no_repeat_ngram_size=3,
                        verbose=False,
                        visualize_fn=streaming_visualizer,
                        parallel_blocks=parallel_blocks,
                    )
                    generation_complete['result'] = (raw_output, response)
                except Exception as e:
                    generation_complete['result'] = ('error', str(e))
                finally:
                    generation_complete['done'] = True
                    event_queue.put(None)  # Signal completion

            # Start generation thread
            gen_thread = threading.Thread(target=run_generation)
            gen_thread.daemon = True
            gen_thread.start()

            # Yield start event
            yield f"data: {json.dumps({'type': 'start', 'message': 'Generation started'})}\n\n"

            # Yield events as they come from the queue
            while not generation_complete['done'] or not event_queue.empty():
                try:
                    event = event_queue.get(timeout=0.1)
                    if event is None:  # Completion signal
                        break
                    yield f"data: {json.dumps(event)}\n\n"
                except queue.Empty:
                    continue

            # Wait for thread to finish
            gen_thread.join(timeout=1.0)

            # Send final response
            if generation_complete['result']:
                raw_output, response = generation_complete['result']
                if raw_output == 'error':
                    yield f"data: {json.dumps({'type': 'error', 'error': response})}\n\n"
                else:
                    yield f"data: {json.dumps({'type': 'complete', 'response': response, 'raw_output': raw_output})}\n\n"

        except Exception as e:
            import traceback
            traceback.print_exc()
            yield f"data: {json.dumps({'type': 'error', 'error': str(e)})}\n\n"

    return Response(
        stream_with_context(generate_events()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )


@app.route('/api/test-stream', methods=['GET'])
def test_stream():
    """Test streaming endpoint."""
    def generate():
        for i in range(10):
            yield f"data: {json.dumps({'message': f'Test message {i+1}'})}\n\n"
            time.sleep(0.5)
        yield f"data: {json.dumps({'message': 'Stream complete'})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )


if __name__ == '__main__':
    # Serve on 7860, the port the Dockerfile EXPOSEs; debug stays off so the
    # Flask reloader doesn't import the app twice inside the container.
    app.run(debug=False, host='0.0.0.0', port=7860, threaded=True)
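
(Aside: for manual testing of the SSE endpoint, a minimal streaming client might look like the sketch below. It is not part of this commit; it assumes the server is running locally on port 7860 and `requests` is installed, and the event shapes mirror the `start`/`update`/`complete` payloads emitted above.)

# Sketch: consume /api/generate-stream as Server-Sent Events (assumed host/port).
import json
import requests

def stream_generation(instruction: str, host: str = "http://localhost:7860"):
    payload = {"instruction": instruction, "steps": 64, "block_size": 128,
               "max_new_tokens": 128, "parallel_blocks": 1}
    with requests.post(f"{host}/api/generate-stream", json=payload, stream=True) as r:
        r.raise_for_status()
        for line in r.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue  # skip blank SSE separators
            event = json.loads(line[len("data: "):])
            if event["type"] == "update":
                # Count how many tokens across all blocks are still masked
                masked = sum(t["type"] == "masked"
                             for b in event["data"]["blocks"] for t in b["tokens"])
                print(f"masked tokens remaining: {masked}")
            elif event["type"] == "complete":
                print("response:", event["response"])
            elif event["type"] == "error":
                raise RuntimeError(event["error"])

if __name__ == "__main__":
    stream_generation("Explain block diffusion in one sentence.")
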
checkpoints/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:26b941c479671cff7d0d93fc1d30711ce717de1abedee1e30c0871a4874db79d
size 491091299
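
(Aside: this file is a Git LFS pointer, not the weights themselves; the real ~491 MB checkpoint is fetched by `git lfs pull`. A downloaded copy can be checked against the pointer with a short sketch like this; the local path is an assumption.)

# Sketch: verify a downloaded checkpoint against the LFS pointer's oid/size.
import hashlib
from pathlib import Path

EXPECTED_OID = "26b941c479671cff7d0d93fc1d30711ce717de1abedee1e30c0871a4874db79d"
EXPECTED_SIZE = 491091299

path = Path("checkpoints/model_fp32.pt")  # assumed local path
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch - still an LFS pointer?"

h = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("checkpoint matches LFS pointer")
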
design.json ADDED
@@ -0,0 +1,185 @@
{
  "design_system": {
    "name": "Cortex Luminance System",
    "description": "A physics-based design system combining soft aesthetic minimalism with strict luminance layering. It relies on lighting simulation (top highlights, bottom shadows) rather than diverse hues to create depth hierarchy.",
    "version": "1.0.0",
    "mode": "light",
    "philosophy": {
      "core_principle": "Depth through Luminance",
      "lighting_source": "Top-down (90 degrees)",
      "surface_material": "Matte white & Soft Glass",
      "accent_strategy": "Functional Purple (oklch 0.65 0.22 290)",
      "layering_logic": "Higher elevation = Higher lightness (or pure white) + Stronger Shadow. Lower elevation = Lower lightness + Inset Shadow."
    }
  },
  "tokens": {
    "colors": {
      "primitives": {
        "base_hue": "270 (Purple/Violet)",
        "neutral_hue": "265 (Cool Gray)"
      },
      "layers": {
        "bg_root": {
          "value": "linear-gradient(135deg, oklch(0.95 0.02 270) 0%, oklch(0.92 0.03 290) 100%)",
          "description": "Level 0: The ambient canvas. Corresponds to the blurry cloud/gradient background."
        },
        "bg_layer_1": {
          "value": "oklch(0.99 0.005 265)",
          "description": "Level 1: The main application window/sidebar surface. Almost white."
        },
        "bg_layer_2": {
          "value": "oklch(1.0 0 0)",
          "description": "Level 2: Cards, Floating Inputs, Modals. Pure White."
        },
        "bg_sunken": {
          "value": "oklch(0.96 0.01 265)",
          "description": "For inset elements (search bars, progress tracks). Slightly darker than layer 1 to simulate depth."
        }
      },
      "text": {
        "primary": "oklch(0.20 0.02 265)",
        "secondary": "oklch(0.55 0.03 265)",
        "accent": "oklch(0.65 0.22 290)"
      },
      "borders": {
        "subtle": "rgba(0, 0, 0, 0.06)",
        "highlight": "rgba(255, 255, 255, 0.8)"
      }
    },
    "typography": {
      "font_family": "Inter, SF Pro Display, system-ui, sans-serif",
      "weights": {
        "regular": 400,
        "medium": 500,
        "semibold": 600
      },
      "scale": {
        "h1": {
          "size": "32px",
          "weight": 600,
          "letter_spacing": "-0.02em"
        },
        "h2": {
          "size": "24px",
          "weight": 500,
          "letter_spacing": "-0.01em"
        },
        "body_lg": {
          "size": "16px",
          "weight": 400
        },
        "body_sm": {
          "size": "14px",
          "weight": 400
        },
        "caption": {
          "size": "12px",
          "weight": 500,
          "uppercase": false
        }
      }
    },
    "spacing": {
      "xs": "4px",
      "sm": "8px",
      "md": "16px",
      "lg": "24px",
      "xl": "32px",
      "container_padding": "20px"
    },
    "radii": {
      "sm": "8px",
      "md": "12px",
      "lg": "16px",
      "full": "9999px (Pill)"
    },
    "shadows": {
      "note": "Shadows must imply a top-down light source. Always pair drop-shadows with top-edge inset highlights.",
      "elevation_low": {
        "css_value": "box-shadow: inset 0 1px 0 0 rgba(255, 255, 255, 1), 0 1px 2px 0 rgba(0, 0, 0, 0.05)",
        "use_case": "Interactive buttons, list items."
      },
      "elevation_medium": {
        "css_value": "box-shadow: inset 0 1px 0 0 rgba(255, 255, 255, 1), 0 4px 6px -1px rgba(0, 0, 0, 0.05), 0 2px 4px -1px rgba(0, 0, 0, 0.03)",
        "use_case": "Standard Cards (Saved Prompts, Suggestions)."
      },
      "elevation_high": {
        "css_value": "box-shadow: inset 0 1px 0 0 rgba(255, 255, 255, 1), 0 10px 15px -3px rgba(0, 0, 0, 0.08), 0 4px 6px -2px rgba(0, 0, 0, 0.04)",
        "use_case": "Floating Input Area, Modals."
      },
      "inset_sunken": {
        "css_value": "box-shadow: inset 0 2px 4px 0 rgba(0, 0, 0, 0.06), inset 0 -1px 0 0 rgba(255, 255, 255, 0.5)",
        "use_case": "Search bars, tracks, unselected states."
      }
    }
  },
  "components": {
    "layout_structure": {
      "sidebar": {
        "width": "260px",
        "background": "bg_layer_1",
        "border_right": "1px solid borders.subtle",
        "padding": "md",
        "style": "Flat surface, low contrast."
      },
      "main_area": {
        "background": "bg_layer_2 (with large rounded corners) OR transparent over bg_root",
        "layout": "Flex-col, centered content, maximum width 900px."
      }
    },
    "buttons": {
      "primary": {
        "bg": "black (or dark purple)",
        "text": "white",
        "radius": "md",
        "shadow": "elevation_low",
        "lighting": "Subtle top gradient (lighter top) to show curvature."
      },
      "ghost": {
        "bg": "transparent",
        "hover_bg": "rgba(0,0,0,0.04)",
        "text": "text.secondary"
      },
      "new_chat": {
        "style": "Pill shape / Full radius",
        "bg": "#1A1A1A",
        "text": "white",
        "icon": "plus"
      }
    },
    "cards": {
      "prompt_card": {
        "bg": "bg_layer_2",
        "radius": "lg",
        "shadow": "elevation_medium",
        "border": "1px solid borders.subtle",
        "hover": "Transform Y -2px, increase shadow to elevation_high."
      }
    },
    "inputs": {
      "search_bar": {
        "style": "Sunken / Inset",
        "bg": "bg_sunken",
        "shadow": "inset_sunken",
        "radius": "md",
        "icon_color": "text.secondary"
      },
      "main_prompt_area": {
        "style": "Elevated Container",
        "bg": "white",
        "shadow": "elevation_high",
        "radius": "lg",
        "border": "1px solid rgba(0,0,0,0.04)"
      }
    },
    "navigation_items": {
      "base_style": "text.secondary, font-medium, md padding",
      "active_state": {
        "bg": "bg_layer_2",
        "text": "text.primary",
        "shadow": "elevation_low",
        "radius": "sm"
      }
    }
  }
}
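
(Aside: the token file is plain data, so it is straightforward to consume from build tooling. As an illustration only, with file path and output names assumed rather than taken from this commit, a few lines of Python can flatten the color layers into CSS custom properties.)

# Sketch: emit CSS custom properties from design.json's color tokens (assumed usage).
import json

with open("design.json") as f:
    tokens = json.load(f)["tokens"]

lines = [":root {"]
for name, layer in tokens["colors"]["layers"].items():
    lines.append(f"  --{name.replace('_', '-')}: {layer['value']};")
for name, value in tokens["colors"]["text"].items():
    lines.append(f"  --text-{name}: {value};")
lines.append("}")
print("\n".join(lines))
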
infer-base.py ADDED
@@ -0,0 +1,778 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
from dataclasses import dataclass
import os
import math


# ============== Model Architecture ==============

class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        var = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(var + self.eps)
        return self.weight * x


class RotaryEmbedding(nn.Module):
    """Rotary Position Embeddings (RoPE) with NTK extrapolation."""

    def __init__(self, dim, max_position_embeddings=16384, base=100000, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.base = base
        self.max_position_embeddings = max_position_embeddings
        self.inv_freq = None
        self._cache = {}

    def _update_freqs(self, device):
        base = self.base * (self.scaling_factor ** (self.dim / (self.dim - 2)))
        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.inv_freq = inv_freq

    def forward(self, x, seq_len=None):
        if seq_len is None:
            seq_len = x.shape[-2]

        if self.inv_freq is None or self.inv_freq.device != x.device:
            self._update_freqs(x.device)

        cache_key = (seq_len, x.device, x.dtype)
        if cache_key in self._cache:
            return self._cache[cache_key]

        t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)

        cos = emb.cos()[None, None, :, :]
        sin = emb.sin()[None, None, :, :]

        self._cache[cache_key] = (cos, sin)
        if len(self._cache) > 10:
            self._cache.pop(next(iter(self._cache)))

        return cos, sin


def apply_rotary_pos_emb(q, k, cos, sin):
    """Apply rotary embeddings to Q and K."""
    def rotate_half(x):
        x1 = x[..., : x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2:]
        return torch.cat((-x2, x1), dim=-1)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
+
79
+
80
+ class DiffusionAttention(nn.Module):
81
+ """Multi-head attention with GQA and Flash Attention support."""
82
+
83
+ def __init__(self, config):
84
+ super().__init__()
85
+ self.hidden_size = config.hidden_size
86
+ self.num_heads = config.num_attention_heads
87
+ self.head_dim = self.hidden_size // self.num_heads
88
+ self.num_key_value_heads = config.num_key_value_heads
89
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
90
+ self.use_flash_attn = config.use_flash_attn
91
+
92
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
93
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
94
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
95
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
96
+
97
+ def forward(self, hidden_states, freqs_cis, attention_mask=None, past_kv=None):
98
+ bsz, q_len, _ = hidden_states.size()
99
+
100
+ q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
101
+ k = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
102
+ v = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
103
+
104
+ cos, sin = freqs_cis
105
+ cos = cos[:, :, :q_len, :]
106
+ sin = sin[:, :, :q_len, :]
107
+ q, k = apply_rotary_pos_emb(q, k, cos, sin)
108
+
109
+ if past_kv is not None:
110
+ cache_k, cache_v = past_kv
111
+ k = torch.cat([cache_k, k], dim=2)
112
+ v = torch.cat([cache_v, v], dim=2)
113
+
114
+ current_kv = (k, v)
115
+
116
+ k = k.repeat_interleave(self.num_key_value_groups, dim=1)
117
+ v = v.repeat_interleave(self.num_key_value_groups, dim=1)
118
+
119
+ attn_mask = None
120
+ if attention_mask is not None:
121
+ attn_mask = attention_mask[:, None, None, :].to(dtype=q.dtype)
122
+ attn_mask = (1.0 - attn_mask) * torch.finfo(q.dtype).min
123
+
124
+ output = F.scaled_dot_product_attention(
125
+ q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
126
+ )
127
+
128
+ output = output.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
129
+ return self.o_proj(output), current_kv
130
+
131
+
132
+ class MLP(nn.Module):
133
+ """Gated MLP with SiLU activation."""
134
+
135
+ def __init__(self, config):
136
+ super().__init__()
137
+ self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
138
+ self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
139
+ self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
140
+ self.act_fn = nn.SiLU()
141
+
142
+ def forward(self, x):
143
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
144
+
145
+
146
+ class BlockDiffusionBlock(nn.Module):
147
+ """Transformer block with pre-norm, attention, and MLP."""
148
+
149
+ def __init__(self, config):
150
+ super().__init__()
151
+ self.self_attn = DiffusionAttention(config)
152
+ self.mlp = MLP(config)
153
+ self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
154
+ self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
155
+ self.use_activation_checkpointing = config.use_activation_checkpointing
156
+
157
+ def forward(self, hidden_states, freqs_cis, attention_mask, past_kv):
158
+ return self._forward(hidden_states, freqs_cis, attention_mask, past_kv)
159
+
160
+ def _forward(self, hidden_states, freqs_cis, attention_mask, past_kv):
161
+ residual = hidden_states
162
+ hidden_states = self.input_layernorm(hidden_states)
163
+ attn_out, new_kv = self.self_attn(hidden_states, freqs_cis, attention_mask, past_kv)
164
+ hidden_states = residual + attn_out
165
+
166
+ residual = hidden_states
167
+ hidden_states = self.post_attention_layernorm(hidden_states)
168
+ hidden_states = residual + self.mlp(hidden_states)
169
+ return hidden_states, new_kv
170
+
171
+
172
+ @dataclass
173
+ class ModelConfig:
174
+ """Model architecture configuration."""
175
+ vocab_size: int = 151936
176
+ hidden_size: int = 1024
177
+ intermediate_size: int = 2816
178
+ num_hidden_layers: int = 16
179
+ num_attention_heads: int = 16
180
+ num_key_value_heads: int = 4
181
+ max_position_embeddings: int = 16384
182
+ rms_norm_eps: float = 1e-6
183
+ rope_theta: float = 100000.0
184
+ pad_token_id: int = 0
185
+ mask_token_id: int = 1
186
+ use_flash_attn: bool = True
187
+ use_activation_checkpointing: bool = False
188
+ attention_dropout: float = 0.0
189
+ hidden_dropout: float = 0.0
190
+
191
+
192
+ class DiffusionLLM(nn.Module):
193
+ """Complete diffusion language model."""
194
+
195
+ def __init__(self, config: ModelConfig):
196
+ super().__init__()
197
+ self.config = config
198
+
199
+ pad_idx = config.pad_token_id if config.pad_token_id < config.vocab_size else None
200
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=pad_idx)
201
+
202
+ self.layers = nn.ModuleList([BlockDiffusionBlock(config) for _ in range(config.num_hidden_layers)])
203
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
204
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
205
+ self.rotary_emb = RotaryEmbedding(
206
+ config.hidden_size // config.num_attention_heads,
207
+ config.max_position_embeddings
208
+ )
209
+
210
+ self.lm_head.weight = self.embed_tokens.weight
211
+
212
+ def forward(self, input_ids, attention_mask=None, past_key_values=None):
213
+ bsz, seqlen = input_ids.shape
214
+ hidden_states = self.embed_tokens(input_ids)
215
+ freqs_cis = self.rotary_emb(hidden_states, seq_len=seqlen)
216
+
217
+ if past_key_values is None:
218
+ past_key_values = [None] * len(self.layers)
219
+
220
+ new_kvs = []
221
+ for i, layer in enumerate(self.layers):
222
+ hidden_states, kv = layer(hidden_states, freqs_cis, attention_mask, past_key_values[i])
223
+ new_kvs.append(kv)
224
+
225
+ hidden_states = self.norm(hidden_states)
226
+ logits = self.lm_head(hidden_states)
227
+ return logits, new_kvs
228
+
229
+ def get_num_params(self, trainable_only=True):
230
+ if trainable_only:
231
+ return sum(p.numel() for p in self.parameters() if p.requires_grad)
232
+ else:
233
+ return sum(p.numel() for p in self.parameters())
234
+
235
+
236
+ # ============== Inference Functions ==============
237
+
238
+ def load_model(model_path: str, device: str = 'cuda'):
239
+ """Load a saved model (fp16 or fp32) for inference."""
240
+ print(f"Loading model from {model_path}...")
241
+
242
+ checkpoint = torch.load(model_path, map_location=device, weights_only=False)
243
+ config = checkpoint['config']
244
+
245
+ model = DiffusionLLM(config)
246
+
247
+ state_dict = checkpoint['model_state']
248
+ state_dict = {k: v.float() for k, v in state_dict.items()}
249
+ model.load_state_dict(state_dict)
250
+
251
+ model = model.to(device)
252
+ model.eval()
253
+
254
+ num_params = model.get_num_params() / 1e6
255
+ file_size = os.path.getsize(model_path) / 1e6
256
+ print(f"✓ Model loaded: {num_params:.1f}M params from {file_size:.1f} MB file")
257
+
258
+ return model, config
259
+
260
+
261
+ def visualize_diffusion_state(tokenizer, context_ids, mask_blocks, is_masked_list, config, clear=True, block_colors=None):
262
+ """Visualize the current state of diffusion generation with multiple blocks.
263
+
264
+ Args:
265
+ mask_blocks: Either a single block tensor (1, block_size) or list of block tensors
266
+ is_masked_list: Either a single mask tensor (1, block_size) or list of mask tensors
267
+ block_colors: List of ANSI color codes for each block. If None, uses defaults.
268
+ """
269
+ import sys
270
+ import os
271
+
272
+ # Default colors for different blocks (green, cyan, yellow, magenta)
273
+ DEFAULT_COLORS = ['\033[92m', '\033[96m', '\033[93m', '\033[95m']
274
+ MASK_COLOR = '\033[90m' # Gray for masked tokens
275
+ RESET = '\033[0m'
276
+
277
+ # Normalize inputs to lists
278
+ if not isinstance(mask_blocks, list):
279
+ mask_blocks = [mask_blocks]
280
+ is_masked_list = [is_masked_list]
281
+
282
+ if block_colors is None:
283
+ block_colors = DEFAULT_COLORS
284
+
285
+ # Decode context (prompt + previously generated blocks) and replace newlines
286
+ context_text = tokenizer.decode(context_ids[0], skip_special_tokens=True).replace('\n', ' ')
287
+
288
+ # Build visualization for all blocks
289
+ all_blocks_text = []
290
+ for block_idx, (mask_block, is_masked) in enumerate(zip(mask_blocks, is_masked_list)):
291
+ color = block_colors[block_idx % len(block_colors)]
292
+ block_tokens = mask_block[0].tolist()
293
+ block_color_tokens = []
294
+
295
+ for i, token_id in enumerate(block_tokens):
296
+ if is_masked[0, i]:
297
+ # Use block-specific color for masked tokens to distinguish blocks
298
+ block_color_tokens.append(f'{MASK_COLOR}██{RESET}')
299
+ else:
300
+ # Decode individual token; use block color for revealed tokens
301
+ token_text = tokenizer.decode([token_id], skip_special_tokens=False)
302
+ block_color_tokens.append(f'{color}{token_text}{RESET}')
303
+
304
+ all_blocks_text.append(''.join(block_color_tokens))
305
+
306
+ # Join all blocks with a subtle separator
307
+ blocks_combined = ''.join(all_blocks_text)
308
+
309
+ # Clear entire terminal
310
+ if clear:
311
+ clear_cmd = 'cls' if os.name == 'nt' else 'clear'
312
+ try:
313
+ os.system(clear_cmd)
314
+ except Exception:
315
+ sys.stdout.write('\r\033[K')
316
+
317
+ # Print legend for parallel blocks
318
+ if len(mask_blocks) > 1:
319
+ legend_parts = []
320
+ for i in range(len(mask_blocks)):
321
+ color = block_colors[i % len(block_colors)]
322
+ legend_parts.append(f'{color}Block {i+1}{RESET}')
323
+ print(f"Generating: {' | '.join(legend_parts)}\n")
324
+
325
+ # Print the full context with colored blocks
326
+ print(f"{context_text}{blocks_combined}", flush=True)
327
+
328
+
329
+ def demo_visualize_truncation():
330
+ """Demo for visualize_diffusion_state without a full model.
331
+ Simulates streaming output and verifies there is no line duplication when content exceeds terminal width.
332
+ """
333
+ class MockTokenizer:
334
+ def __init__(self):
335
+ # Map token id to token text (simple ASCII characters and spaces)
336
+ self.vocab = {i: chr(65 + (i % 26)) for i in range(256)}
337
+ self.vocab[32] = ' '
338
+ self.eos_token = '\n'
339
+ self.pad_token = ' '
340
+
341
+ def decode(self, ids, skip_special_tokens=True):
342
+ # ids can be tensor or list
343
+ if isinstance(ids, torch.Tensor):
344
+ ids = ids.tolist()
345
+ if isinstance(ids, (list, tuple)):
346
+ return ''.join(self.vocab.get(int(i) % 256, '?') for i in ids)
347
+ return str(ids)
348
+
349
+ tok = MockTokenizer()
350
+ # Create a long context and a block that's also long
351
+ # Make context exceed terminal width
352
+ term_width = 80
353
+ long_context_ids = torch.tensor([[i % 26 + 65 for i in range(120)]], dtype=torch.long)
354
+ block_size = 32
355
+ mask_block = torch.full((1, block_size), 32, dtype=torch.long) # spaces
356
+ is_masked = torch.ones(1, block_size, dtype=torch.bool)
357
+ for i in range(0, block_size, 3):
358
+ is_masked[0, i] = False
359
+ mask_block[0, i] = 65 + (i % 26)
360
+
361
+ print('\nRunning demo: long prompt + block to test truncation\n')
362
+ for i in range(8):
363
+ visualize_diffusion_state(tok, long_context_ids, [mask_block], [is_masked], ModelConfig(), clear=(i > 0))
364
+ # rotate some tokens to simulate diffusion
365
+ mask_block = torch.roll(mask_block, shifts=1, dims=1)
366
+ time_delay = 0.08
367
+ try:
368
+ import time
369
+ time.sleep(time_delay)
370
+ except Exception:
371
+ pass
372
+ print('\n\nDemo completed.')


@torch.no_grad()
def generate_block_diffusion(
    model,
    tokenizer,
    prompt: str,
    steps: int = 16,
    block_size: int = 64,
    max_new_tokens: int = 256,
    device: str = 'cuda',
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.9,
    repetition_penalty: float = 1.2,
    no_repeat_ngram_size: int = 3,
    visualize: bool = False,
    parallel_blocks: int = 1,  # Number of blocks to generate in parallel
):
    """Generate text using block diffusion with proper sampling and repetition control.

    Args:
        visualize: If True, stream output in real-time showing the diffusion effect.
        parallel_blocks: Number of blocks to generate in parallel (1-4 recommended).
    """
    import time
    model.eval()

    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    config = model.module.config if hasattr(model, 'module') else model.config
    if hasattr(model, '_orig_mod'):
        config = model._orig_mod.config

    num_blocks = max_new_tokens // block_size
    parallel_blocks = min(parallel_blocks, num_blocks)  # Can't parallelize more than total blocks

    if not visualize:
        if parallel_blocks > 1:
            print(f"Generating {num_blocks} blocks of {block_size} tokens each ({parallel_blocks} blocks in parallel)...")
        else:
            print(f"Generating {num_blocks} blocks of {block_size} tokens each...")
    else:
        print("\n\033[94mStarting diffusion generation...\033[0m\n")
        print(prompt, end='', flush=True)

    context_ids = prompt_ids
    all_generated_tokens = set(prompt_ids[0].tolist())

    # Process blocks in batches of parallel_blocks
    blocks_generated = 0
    while blocks_generated < num_blocks:
        # Determine how many blocks to generate this iteration
        current_parallel = min(parallel_blocks, num_blocks - blocks_generated)

        if current_parallel > 1:
            # Parallel block generation
            generated_blocks = _generate_parallel_blocks(
                model, tokenizer, context_ids, config, device,
                current_parallel, block_size, steps, temperature,
                top_k, top_p, repetition_penalty, no_repeat_ngram_size,
                all_generated_tokens, visualize
            )

            # Concatenate all generated blocks to context
            for block in generated_blocks:
                context_ids = torch.cat([context_ids, block], dim=1)
                all_generated_tokens.update(block[0].tolist())

            if not visualize:
                print(f"  Blocks {blocks_generated + 1}-{blocks_generated + current_parallel}/{num_blocks} complete")
            blocks_generated += current_parallel
        else:
            # Single block generation (original logic)
            mask_block, block_token_history = _generate_single_block(
                model, tokenizer, context_ids, config, device,
                block_size, steps, temperature, top_k, top_p,
                repetition_penalty, no_repeat_ngram_size,
                all_generated_tokens, visualize
            )

            context_ids = torch.cat([context_ids, mask_block], dim=1)
            all_generated_tokens.update(mask_block[0].tolist())

            if not visualize:
                print(f"  Block {blocks_generated + 1}/{num_blocks} complete")
            blocks_generated += 1

    if visualize:
        # Final newline after visualization
        print("\n")

    generated_ids = context_ids[0].tolist()
    return tokenizer.decode(generated_ids, skip_special_tokens=True)
+ return tokenizer.decode(generated_ids, skip_special_tokens=True)
467
+
468
+
469
+ def _generate_single_block(
470
+ model, tokenizer, context_ids, config, device,
471
+ block_size, steps, temperature, top_k, top_p,
472
+ repetition_penalty, no_repeat_ngram_size,
473
+ all_generated_tokens, visualize
474
+ ):
475
+ """Generate a single block using diffusion."""
476
+ mask_block = torch.full((1, block_size), config.mask_token_id, device=device)
477
+ is_masked = torch.ones(1, block_size, dtype=torch.bool, device=device)
478
+ block_token_history = []
479
+
480
+ for step_idx in range(steps):
481
+ full_input = torch.cat([context_ids, mask_block], dim=1)
482
+ attention_mask = torch.ones_like(full_input, dtype=torch.float32)
483
+
484
+ logits, _ = model(full_input, attention_mask=attention_mask)
485
+ block_logits = logits[:, -block_size:, :]
486
+
487
+ block_logits = _apply_sampling_controls(
488
+ block_logits, context_ids, mask_block, is_masked,
489
+ repetition_penalty, temperature, top_k, top_p,
490
+ no_repeat_ngram_size, block_token_history
491
+ )
492
+
493
+ probs = F.softmax(block_logits, dim=-1)
494
+ probs = torch.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
495
+ probs = probs.clamp(min=1e-10)
496
+ probs = probs / probs.sum(dim=-1, keepdim=True)
497
+
498
+ sampled_tokens = torch.multinomial(probs.view(-1, probs.size(-1)), num_samples=1)
499
+ sampled_tokens = sampled_tokens.view(1, block_size)
500
+
501
+ confidence = probs.gather(-1, sampled_tokens.unsqueeze(-1)).squeeze(-1)
502
+
503
+ tokens_to_unmask = max(1, block_size // steps)
504
+ if step_idx == steps - 1:
505
+ tokens_to_unmask = is_masked.sum().item()
506
+
507
+ if tokens_to_unmask > 0 and is_masked.sum() > 0:
508
+ masked_confidence = confidence.clone()
509
+ masked_confidence[~is_masked] = -1.0
510
+
511
+ num_to_unmask = min(tokens_to_unmask, is_masked.sum().item())
512
+ _, top_indices = torch.topk(masked_confidence.view(-1), num_to_unmask)
513
+
514
+ for idx in top_indices:
515
+ mask_block[0, idx] = sampled_tokens[0, idx]
516
+ is_masked[0, idx] = False
517
+ block_token_history.append(sampled_tokens[0, idx].item())
518
+ all_generated_tokens.add(sampled_tokens[0, idx].item())
519
+
520
+ if visualize:
521
+ visualize_diffusion_state(tokenizer, context_ids, [mask_block], [is_masked], config, clear=(step_idx > 0))
522
+
523
+ return mask_block, block_token_history
524
+
525
+
526
+ def _generate_parallel_blocks(
527
+ model, tokenizer, context_ids, config, device,
528
+ num_parallel, block_size, steps, temperature,
529
+ top_k, top_p, repetition_penalty, no_repeat_ngram_size,
530
+ all_generated_tokens, visualize
531
+ ):
532
+ """Generate multiple blocks in parallel using batched computation.
533
+
534
+ Each block sees all previous blocks in the sequence, maintaining proper order:
535
+ - Block 0: context + [block0]
536
+ - Block 1: context + [block0] + [block1]
537
+ - Block 2: context + [block0] + [block1] + [block2]
538
+ - etc.
539
+
540
+ This ensures sequential coherence while still benefiting from batched computation.
541
+ """
542
+ batch_size = num_parallel
543
+ context_len = context_ids.shape[1]
544
+
545
+ # Initialize mask blocks for all parallel blocks
546
+ # Shape: (num_parallel, block_size)
547
+ mask_blocks = torch.full((batch_size, block_size), config.mask_token_id, device=device)
548
+ is_masked = torch.ones(batch_size, block_size, dtype=torch.bool, device=device)
549
+ block_token_histories = [[] for _ in range(batch_size)]
550
+
551
+ for step_idx in range(steps):
552
+ # Build inputs with proper sequential structure
553
+ # Each batch item has context + all blocks up to and including its own position
554
+ # Block i sees: context + block_0 + block_1 + ... + block_i
555
+
556
+ # Create padded inputs - each batch item has different length
557
+ # We'll pad to the longest sequence (which is the last block)
558
+ max_seq_len = context_len + (num_parallel * block_size)
559
+
560
+ # Build full input for each batch item
561
+ full_inputs = []
562
+ attention_masks = []
563
+
564
+ for b in range(batch_size):
565
+ # This block sees: context + all previous blocks + its own block
566
+ seq_parts = [context_ids[0]] # Start with context
567
+
568
+ # Add all blocks from 0 to b (inclusive)
569
+ for prev_b in range(b + 1):
570
+ seq_parts.append(mask_blocks[prev_b])
571
+
572
+ # Concatenate to form this batch item's input
573
+ batch_input = torch.cat(seq_parts, dim=0) # (seq_len,)
574
+ current_len = batch_input.shape[0]
575
+
576
+ # Pad to max_seq_len
577
+ padding_needed = max_seq_len - current_len
578
+ if padding_needed > 0:
579
+ padding = torch.full((padding_needed,), config.pad_token_id, device=device)
580
+ batch_input = torch.cat([batch_input, padding], dim=0)
581
+
582
+ full_inputs.append(batch_input)
583
+
584
+ # Create attention mask (1 for real tokens, 0 for padding)
585
+ attn_mask = torch.zeros(max_seq_len, device=device)
586
+ attn_mask[:current_len] = 1.0
587
+ attention_masks.append(attn_mask)
588
+
589
+ # Stack into batched tensors
590
+ full_input = torch.stack(full_inputs, dim=0) # (batch, max_seq_len)
591
+ attention_mask = torch.stack(attention_masks, dim=0) # (batch, max_seq_len)
592
+
593
+ # Single forward pass for all blocks
594
+ logits, _ = model(full_input, attention_mask=attention_mask)
595
+
596
+ # Extract logits for each block's position
597
+ # Block b's logits are at positions [context_len + b*block_size : context_len + (b+1)*block_size]
598
+ block_logits_list = []
599
+ for b in range(batch_size):
600
+ start_pos = context_len + (b * block_size)
601
+ end_pos = start_pos + block_size
602
+ block_logits_list.append(logits[b, start_pos:end_pos, :])
603
+
604
+ block_logits = torch.stack(block_logits_list, dim=0) # (batch, block_size, vocab)
605
+
606
+ # Apply sampling controls per batch item
607
+ for b in range(batch_size):
608
+ # Build context that includes previous blocks for repetition penalty
609
+ extended_context = context_ids
610
+ if b > 0:
611
+ prev_blocks = torch.cat([mask_blocks[pb:pb+1] for pb in range(b)], dim=1)
612
+ extended_context = torch.cat([context_ids, prev_blocks], dim=1)
613
+
614
+ block_logits[b:b+1] = _apply_sampling_controls(
615
+ block_logits[b:b+1],
616
+ extended_context,
617
+ mask_blocks[b:b+1],
618
+ is_masked[b:b+1],
619
+ repetition_penalty, temperature, top_k, top_p,
620
+ no_repeat_ngram_size, block_token_histories[b]
621
+ )
622
+
623
+ probs = F.softmax(block_logits, dim=-1)
624
+ probs = torch.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
625
+ probs = probs.clamp(min=1e-10)
626
+ probs = probs / probs.sum(dim=-1, keepdim=True)
627
+
628
+ # Sample for all batches
629
+ sampled_tokens = torch.multinomial(probs.view(-1, probs.size(-1)), num_samples=1)
630
+ sampled_tokens = sampled_tokens.view(batch_size, block_size)
631
+
632
+ confidence = probs.gather(-1, sampled_tokens.unsqueeze(-1)).squeeze(-1)
633
+
634
+ tokens_to_unmask = max(1, block_size // steps)
635
+ if step_idx == steps - 1:
636
+ tokens_to_unmask = block_size # Unmask all remaining
637
+
638
+ # Unmask for each batch item
639
+ for b in range(batch_size):
640
+ if is_masked[b].sum() > 0:
641
+ masked_confidence = confidence[b].clone()
642
+ masked_confidence[~is_masked[b]] = -1.0
643
+
644
+ num_to_unmask = min(tokens_to_unmask, is_masked[b].sum().item())
645
+ if num_to_unmask > 0:
646
+ _, top_indices = torch.topk(masked_confidence, num_to_unmask)
647
+
648
+ for idx in top_indices:
649
+ mask_blocks[b, idx] = sampled_tokens[b, idx]
650
+ is_masked[b, idx] = False
651
+ block_token_histories[b].append(sampled_tokens[b, idx].item())
652
+
653
+ if visualize:
654
+ # Visualize all blocks with different colors
655
+ block_list = [mask_blocks[b:b+1] for b in range(batch_size)]
656
+ is_masked_list = [is_masked[b:b+1] for b in range(batch_size)]
657
+ visualize_diffusion_state(
658
+ tokenizer, context_ids, block_list, is_masked_list,
659
+ config, clear=(step_idx > 0)
660
+ )
661
+
662
+ # Return list of generated blocks
663
+ return [mask_blocks[b:b+1] for b in range(batch_size)]
664
+
665
+
666
+ def _apply_sampling_controls(
667
+ block_logits, context_ids, mask_block, is_masked,
668
+ repetition_penalty, temperature, top_k, top_p,
669
+ no_repeat_ngram_size, block_token_history
670
+ ):
671
+ """Apply repetition penalty, temperature, top-k, top-p, and n-gram blocking."""
672
+ if repetition_penalty != 1.0:
673
+ seen_tokens = set(context_ids[0].tolist())
674
+ for i in range(mask_block.shape[1]):
675
+ if not is_masked[0, i]:
676
+ seen_tokens.add(mask_block[0, i].item())
677
+
678
+ for token_id in seen_tokens:
679
+ if token_id < block_logits.shape[-1]:
680
+ if block_logits[0, :, token_id].mean() > 0:
681
+ block_logits[:, :, token_id] /= repetition_penalty
682
+ else:
683
+ block_logits[:, :, token_id] *= repetition_penalty
684
+
685
+ block_logits = block_logits / temperature
686
+
687
+ if top_k > 0:
688
+ top_k_logits, top_k_indices = torch.topk(block_logits, top_k, dim=-1)
689
+ block_logits = torch.full_like(block_logits, float('-inf'))
690
+ block_logits.scatter_(-1, top_k_indices, top_k_logits)
691
+
692
+ if top_p < 1.0:
693
+ sorted_logits, sorted_indices = torch.sort(block_logits, descending=True, dim=-1)
694
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
695
+
696
+ sorted_indices_to_remove = cumulative_probs > top_p
697
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
698
+ sorted_indices_to_remove[..., 0] = 0
699
+
700
+ indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
701
+ block_logits[indices_to_remove] = float('-inf')
702
+
703
+ if no_repeat_ngram_size > 0 and len(block_token_history) >= no_repeat_ngram_size - 1:
704
+ recent_ngram = tuple(block_token_history[-(no_repeat_ngram_size-1):])
705
+ full_history = context_ids[0].tolist() + block_token_history
706
+ for i in range(len(full_history) - no_repeat_ngram_size + 1):
707
+ if tuple(full_history[i:i+no_repeat_ngram_size-1]) == recent_ngram:
708
+ blocked_token = full_history[i + no_repeat_ngram_size - 1]
709
+ if blocked_token < block_logits.shape[-1]:
710
+ block_logits[:, :, blocked_token] = float('-inf')
711
+
712
+ # Safety check: if all logits are -inf, reset to uniform distribution
713
+ all_inf_mask = torch.isinf(block_logits).all(dim=-1)
714
+ if all_inf_mask.any():
715
+ block_logits[all_inf_mask] = 0.0
716
+
717
+ return block_logits
718
+
719
+
720
+ # ============== Main Entry Point ==============
721
+
722
+ def main():
723
+ """Main inference function."""
724
+ # Configuration
725
+ model_path = "../extra-final-boss/checkpoints/model_fp32.pt"
726
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
727
+
728
+ print(f"Using device: {device}")
729
+
730
+ # Allow a quick demo mode to test visualization without loading the model
731
+ import sys
732
+ if len(sys.argv) > 1 and sys.argv[1] == 'demo':
733
+ demo_visualize_truncation()
734
+ return
735
+
736
+ # Load tokenizer
737
+ print("Loading tokenizer...")
738
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
739
+ if tokenizer.pad_token is None:
740
+ tokenizer.pad_token = tokenizer.eos_token
741
+
742
+ # Load model
743
+ model, config = load_model(model_path, device)
744
+
745
+ # Generate text
746
+ print("\n" + "=" * 50)
747
+ print("Text Generation")
748
+ print("=" * 50)
749
+
750
+ prompt = "Barrack Obama was born in "
751
+ print(f"Prompt: {prompt}\n")
752
+
753
+ # Set visualize=True to see real-time diffusion effect
754
+ visualize = True
755
+ parallel_blocks = 4 # Generate 2-4 blocks in parallel for speedup
756
+
757
+ generated = generate_block_diffusion(
758
+ model,
759
+ tokenizer,
760
+ prompt=prompt,
761
+ steps=64,
762
+ block_size=64,
763
+ max_new_tokens=512,
764
+ device=device,
765
+ temperature=1,
766
+ top_k=40,
767
+ top_p=0.9,
768
+ repetition_penalty=1.3,
769
+ no_repeat_ngram_size=3,
770
+ visualize=visualize,
771
+ parallel_blocks=parallel_blocks,
772
+ )
773
+
774
+ print(f"\nGenerated text:\n{generated}")
775
+
776
+
777
+ if __name__ == "__main__":
778
+ main()
infer-chat.py ADDED
@@ -0,0 +1,656 @@
import os
import sys
import time
import argparse
import importlib.util

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

# Tracks how many lines the last visualization printed so we can overwrite it
_visualize_last_lines = 0


def try_import_infer_base(base_path: str):
    """Dynamically import `infer-base.py` as a module and return it, or None on failure."""
    if not os.path.exists(base_path):
        return None
    try:
        spec = importlib.util.spec_from_file_location("infer_base", base_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module
    except Exception as e:
        print(f"Warning: failed to import {base_path}: {e}")
        return None


def load_finetuned_model(model_path: str, device: str = 'cuda'):
    """Load a saved fine-tuned model for inference."""
    print(f"Loading model from {model_path}...")

    checkpoint = torch.load(model_path, map_location=device, weights_only=False)
    config = checkpoint['config']

    # Create model
    model = DiffusionLLM(config)

    # Load weights
    state_dict = checkpoint['model_state']
    state_dict = {k: v.float() for k, v in state_dict.items()}
    model.load_state_dict(state_dict)

    model = model.to(device)
    model.eval()

    num_params = sum(p.numel() for p in model.parameters()) / 1e6
    print(f"✓ Loaded model: {num_params:.1f}M parameters")

    # Print training info if available
    if 'step' in checkpoint:
        print(f"  Trained for {checkpoint['step']} steps")
    if 'best_val_loss' in checkpoint:
        print(f"  Best validation loss: {checkpoint['best_val_loss']:.4f}")

    return model, config


@torch.no_grad()
def generate_block_diffusion(
    model,
    tokenizer,
    prompt: str,
    steps: int = 32,
    block_size: int = 32,
    max_new_tokens: int = 128,
    device: str = 'cuda',
    temperature: float = 0.8,
    top_k: int = 50,
    top_p: float = 0.9,
    repetition_penalty: float = 1.2,
    no_repeat_ngram_size: int = 3,
    verbose: bool = True,
    visualize_fn=None,
    parallel_blocks: int = 1,
):
    """
    Generate text using block diffusion with sampling controls.

    If `visualize_fn` is provided it will be called as:
        visualize_fn(tokenizer, context_ids, mask_block, is_masked, config, clear=True)

    Returns the decoded generated string (including prompt).
    """
    model.eval()

    # Encode prompt
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Get model config
    config = model.module.config if hasattr(model, 'module') else getattr(model, 'config', None)
    if hasattr(model, '_orig_mod'):
        config = model._orig_mod.config

    if config is None:
        raise RuntimeError("Could not determine model config")

    num_blocks = max_new_tokens // block_size
    parallel_blocks = min(parallel_blocks, num_blocks)

    if verbose:
        print(f"Generating {num_blocks} blocks of {block_size} tokens ({max_new_tokens} max_new_tokens)\n")

    context_ids = prompt_ids
    all_generated_tokens = set(prompt_ids[0].tolist())

    blocks_generated = 0
    while blocks_generated < num_blocks:
        current_parallel = min(parallel_blocks, num_blocks - blocks_generated)

        if current_parallel > 1:
            new_blocks = _generate_parallel_blocks(
                model, tokenizer, context_ids, config, device,
                current_parallel, block_size, steps, temperature,
                top_k, top_p, repetition_penalty, no_repeat_ngram_size,
                all_generated_tokens, visualize_fn
            )
            for block in new_blocks:
                context_ids = torch.cat([context_ids, block], dim=1)
                blocks_generated += 1
        else:
            mask_block, block_token_history = _generate_single_block(
                model, tokenizer, context_ids, config, device,
                block_size, steps, temperature, top_k, top_p,
                repetition_penalty, no_repeat_ngram_size,
                all_generated_tokens, visualize_fn
            )
            context_ids = torch.cat([context_ids, mask_block], dim=1)
            blocks_generated += 1

    generated_ids = context_ids[0].tolist()
    return tokenizer.decode(generated_ids, skip_special_tokens=False)


def _apply_sampling_controls(
    block_logits, context_ids, mask_block, is_masked,
    repetition_penalty, temperature, top_k, top_p,
    no_repeat_ngram_size, block_token_history
):
    """Apply repetition penalty, temperature, top-k, top-p, and n-gram blocking."""
    if repetition_penalty != 1.0:
        seen_tokens = set(context_ids[0].tolist())
        for i in range(mask_block.shape[1]):
            if not is_masked[0, i]:
                seen_tokens.add(mask_block[0, i].item())

        for token_id in seen_tokens:
            if token_id < block_logits.shape[-1]:
                avg = block_logits[0, :, token_id].mean()
                if avg > 0:
151
+ block_logits[:, :, token_id] /= repetition_penalty
152
+ else:
153
+ block_logits[:, :, token_id] *= repetition_penalty
154
+
155
+ block_logits = block_logits / temperature
156
+
157
+ if top_k > 0:
158
+ k = min(top_k, block_logits.size(-1))
159
+ top_k_logits, top_k_indices = torch.topk(block_logits, k, dim=-1)
160
+ filtered = torch.full_like(block_logits, float('-inf'))
161
+ filtered.scatter_(-1, top_k_indices, top_k_logits)
162
+ block_logits = filtered
163
+
164
+ if top_p < 1.0:
165
+ sorted_logits, sorted_indices = torch.sort(block_logits, descending=True, dim=-1)
166
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
167
+
168
+ sorted_indices_to_remove = cumulative_probs > top_p
169
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
170
+ sorted_indices_to_remove[..., 0] = 0
171
+
172
+ indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
173
+ block_logits[indices_to_remove] = float('-inf')
174
+
175
+ if no_repeat_ngram_size > 0 and len(block_token_history) >= no_repeat_ngram_size - 1:
176
+ recent_ngram = tuple(block_token_history[-(no_repeat_ngram_size - 1):])
177
+ full_history = context_ids[0].tolist() + block_token_history
178
+ for i in range(len(full_history) - no_repeat_ngram_size + 1):
179
+ if tuple(full_history[i:i + no_repeat_ngram_size - 1]) == recent_ngram:
180
+ blocked_token = full_history[i + no_repeat_ngram_size - 1]
181
+ if blocked_token < block_logits.shape[-1]:
182
+ block_logits[:, :, blocked_token] = float('-inf')
183
+
184
+ # Safety: reset if all logits are -inf
185
+ all_inf_mask = torch.isinf(block_logits).all(dim=-1)
186
+ if all_inf_mask.any():
187
+ block_logits[all_inf_mask] = 0.0
188
+
189
+ return block_logits
190
+
191
+
192
+ def _generate_single_block(
193
+ model, tokenizer, context_ids, config, device,
194
+ block_size, steps, temperature, top_k, top_p,
195
+ repetition_penalty, no_repeat_ngram_size,
196
+ all_generated_tokens, visualize_fn=None
197
+ ):
198
+ """Generate a single block using diffusion."""
199
+ mask_block = torch.full((1, block_size), config.mask_token_id, device=device)
200
+ is_masked = torch.ones(1, block_size, dtype=torch.bool, device=device)
201
+ block_token_history = []
202
+
203
+ for step_idx in range(steps):
204
+ full_input = torch.cat([context_ids, mask_block], dim=1)
205
+ attention_mask = torch.ones_like(full_input, dtype=torch.float32)
206
+
207
+ logits, _ = model(full_input, attention_mask=attention_mask)
208
+ block_logits = logits[:, -block_size:, :]
209
+
210
+ block_logits = _apply_sampling_controls(
211
+ block_logits, context_ids, mask_block, is_masked,
212
+ repetition_penalty, temperature, top_k, top_p,
213
+ no_repeat_ngram_size, block_token_history
214
+ )
215
+
216
+ probs = F.softmax(block_logits, dim=-1)
217
+ probs = torch.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
218
+ probs = probs.clamp(min=1e-10)
219
+ probs = probs / probs.sum(dim=-1, keepdim=True)
220
+
221
+ sampled_tokens = torch.multinomial(probs.view(-1, probs.size(-1)), num_samples=1)
222
+ sampled_tokens = sampled_tokens.view(1, block_size)
223
+
224
+ confidence = probs.gather(-1, sampled_tokens.unsqueeze(-1)).squeeze(-1)
225
+
226
+ tokens_to_unmask = max(1, block_size // steps)
227
+ if step_idx == steps - 1:
228
+ tokens_to_unmask = int(is_masked.sum().item())
229
+
230
+ if tokens_to_unmask > 0 and is_masked.sum() > 0:
231
+ masked_confidence = confidence.clone()
232
+ masked_confidence[~is_masked] = -1.0
233
+
234
+ num_to_unmask = min(int(tokens_to_unmask), int(is_masked.sum().item()))
235
+ _, top_indices = torch.topk(masked_confidence.view(-1), num_to_unmask)
236
+
237
+ for idx in top_indices:
238
+ idx = int(idx.item())
239
+ mask_block[0, idx] = sampled_tokens[0, idx]
240
+ is_masked[0, idx] = False
241
+ block_token_history.append(sampled_tokens[0, idx].item())
242
+ all_generated_tokens.add(sampled_tokens[0, idx].item())
243
+
244
+ if callable(visualize_fn):
245
+ try:
246
+ visualize_fn(tokenizer, context_ids, mask_block, is_masked, config, clear=(step_idx > 0))
247
+ except Exception:
248
+ pass
249
+ elif visualize_fn:
250
+ visualize_diffusion_state_local(tokenizer, context_ids, mask_block, is_masked, config, clear=(step_idx > 0))
251
+
252
+ return mask_block, block_token_history
253
+
254
+
255
+ def _generate_parallel_blocks(
256
+ model, tokenizer, context_ids, config, device,
257
+ num_parallel, block_size, steps, temperature,
258
+ top_k, top_p, repetition_penalty, no_repeat_ngram_size,
259
+ all_generated_tokens, visualize_fn=None
260
+ ):
261
+ """Generate multiple blocks in parallel using batched computation.
262
+
263
+ Each block sees all previous blocks in the sequence, maintaining proper order:
264
+ - Block 0: context + [block0]
265
+ - Block 1: context + [block0] + [block1]
266
+ - Block 2: context + [block0] + [block1] + [block2]
267
+ - etc.
268
+
269
+ This ensures sequential coherence while still benefiting from batched computation.
270
+ """
271
+ batch_size = num_parallel
272
+ context_len = context_ids.shape[1]
273
+
274
+ # Initialize mask blocks for all parallel blocks
275
+ # Shape: (num_parallel, block_size)
276
+ mask_blocks = torch.full((batch_size, block_size), config.mask_token_id, device=device)
277
+ is_masked = torch.ones(batch_size, block_size, dtype=torch.bool, device=device)
278
+ block_token_histories = [[] for _ in range(batch_size)]
279
+
280
+ for step_idx in range(steps):
281
+ # Build inputs with proper sequential structure
282
+ # Each batch item has context + all previous blocks + its own block
283
+ # Block i sees: context + block_0 + block_1 + ... + block_i
284
+
285
+ # Create padded inputs - each batch item has different length
286
+ # We'll pad to the longest sequence (which is the last block)
287
+ max_seq_len = context_len + (num_parallel * block_size)
288
+
289
+ # Build full input for each batch item
290
+ full_inputs = []
291
+ attention_masks = []
292
+
293
+ for b in range(batch_size):
294
+ # This block sees: context + all previous blocks + its own block
295
+ seq_parts = [context_ids[0]] # Start with context
296
+
297
+ # Add all blocks from 0 to b (inclusive)
298
+ for prev_b in range(b + 1):
299
+ seq_parts.append(mask_blocks[prev_b])
300
+
301
+ # Concatenate to form this batch item's input
302
+ batch_input = torch.cat(seq_parts, dim=0) # (seq_len,)
303
+ current_len = batch_input.shape[0]
304
+
305
+ # Pad to max_seq_len
306
+ padding_needed = max_seq_len - current_len
307
+ if padding_needed > 0:
308
+ pad_token = config.pad_token_id if config.pad_token_id is not None else 0
309
+ padding = torch.full((padding_needed,), pad_token, device=device)
310
+ batch_input = torch.cat([batch_input, padding], dim=0)
311
+
312
+ full_inputs.append(batch_input)
313
+
314
+ # Create attention mask (1 for real tokens, 0 for padding)
315
+ attn_mask = torch.zeros(max_seq_len, device=device)
316
+ attn_mask[:current_len] = 1.0
317
+ attention_masks.append(attn_mask)
318
+
319
+ # Stack into batched tensors
320
+ full_input = torch.stack(full_inputs, dim=0) # (batch, max_seq_len)
321
+ attention_mask = torch.stack(attention_masks, dim=0) # (batch, max_seq_len)
322
+
323
+ # Single forward pass for all blocks
324
+ logits, _ = model(full_input, attention_mask=attention_mask)
325
+
326
+ # Extract logits for each block's position
327
+ # Block b's logits are at positions [context_len + b*block_size : context_len + (b+1)*block_size]
328
+ block_logits_list = []
329
+ for b in range(batch_size):
330
+ start_pos = context_len + (b * block_size)
331
+ end_pos = start_pos + block_size
332
+ block_logits_list.append(logits[b, start_pos:end_pos, :])
333
+
334
+ block_logits = torch.stack(block_logits_list, dim=0) # (batch, block_size, vocab)
335
+
336
+ # Apply sampling controls per batch item
337
+ for b in range(batch_size):
338
+ # Build context that includes previous blocks for repetition penalty
339
+ extended_context = context_ids
340
+ if b > 0:
341
+ prev_blocks = mask_blocks[:b]
342
+ extended_context = torch.cat([context_ids] + [prev_blocks.view(1, -1)], dim=1)
343
+
344
+ block_logits[b:b+1] = _apply_sampling_controls(
345
+ block_logits[b:b+1],
346
+ extended_context,
347
+ mask_blocks[b:b+1],
348
+ is_masked[b:b+1],
349
+ repetition_penalty, temperature, top_k, top_p,
350
+ no_repeat_ngram_size, block_token_histories[b]
351
+ )
352
+
353
+ probs = F.softmax(block_logits, dim=-1)
354
+ probs = torch.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
355
+ probs = probs.clamp(min=1e-10)
356
+ probs = probs / probs.sum(dim=-1, keepdim=True)
357
+
358
+ # Sample for all batches
359
+ sampled_tokens = torch.multinomial(probs.view(-1, probs.size(-1)), num_samples=1)
360
+ sampled_tokens = sampled_tokens.view(batch_size, block_size)
361
+
362
+ confidence = probs.gather(-1, sampled_tokens.unsqueeze(-1)).squeeze(-1)
363
+
364
+ tokens_to_unmask = max(1, block_size // steps)
365
+ if step_idx == steps - 1:
366
+ tokens_to_unmask = block_size # Unmask all remaining
367
+
368
+ # Unmask for each batch item
369
+ for b in range(batch_size):
370
+ if is_masked[b].sum() > 0:
371
+ masked_confidence = confidence[b]
372
+ masked_confidence = masked_confidence.clone()
373
+ masked_confidence[~is_masked[b]] = -1.0
374
+
375
+ num_to_unmask = min(int(tokens_to_unmask), int(is_masked[b].sum().item()))
376
+ _, top_indices = torch.topk(masked_confidence.view(-1), num_to_unmask)
377
+
378
+ for idx in top_indices:
379
+ idx = int(idx.item())
380
+ mask_blocks[b, idx] = sampled_tokens[b, idx]
381
+ is_masked[b, idx] = False
382
+ block_token_histories[b].append(sampled_tokens[b, idx].item())
383
+ all_generated_tokens.add(sampled_tokens[b, idx].item())
384
+
385
+ if callable(visualize_fn):
386
+ try:
387
+ block_list = [mask_blocks[b:b+1] for b in range(batch_size)]
388
+ is_masked_list = [is_masked[b:b+1] for b in range(batch_size)]
389
+ visualize_fn(tokenizer, context_ids, block_list, is_masked_list, config, clear=(step_idx > 0))
390
+ except Exception:
391
+ pass
392
+ elif visualize_fn:
393
+ block_list = [mask_blocks[b:b+1] for b in range(batch_size)]
394
+ is_masked_list = [is_masked[b:b+1] for b in range(batch_size)]
395
+ visualize_diffusion_state_local(tokenizer, context_ids, block_list, is_masked_list, config, clear=(step_idx > 0))
396
+
397
+ # Return list of generated blocks
398
+ return [mask_blocks[b:b+1] for b in range(batch_size)]
399
+
400
+
401
+ def chat(model, tokenizer, instruction: str, parallel_blocks: int = 1, **kwargs):
402
+ """Simple chat interface."""
403
+ device = next(model.parameters()).device
404
+
405
+ prompt = format_instruct_prompt(instruction)
406
+
407
+ generated = generate_block_diffusion(
408
+ model,
409
+ tokenizer,
410
+ prompt=prompt,
411
+ device=device,
412
+ parallel_blocks=parallel_blocks,
413
+ **kwargs
414
+ )
415
+
416
+ # Extract all assistant responses using ChatML tags
417
+ start_tag = "<|im_start|>assistant"
418
+ end_tag = "<|im_end|>"
419
+ resp_parts = []
420
+ pos = 0
421
+ while True:
422
+ start_idx = generated.find(start_tag, pos)
423
+ if start_idx == -1:
424
+ break
425
+ start_idx += len(start_tag)
426
+ end_idx = generated.find(end_tag, start_idx)
427
+ if end_idx == -1:
428
+ resp_parts.append(generated[start_idx:].strip())
429
+ break
430
+ resp_parts.append(generated[start_idx:end_idx].strip())
431
+ pos = end_idx + len(end_tag)
432
+
433
+ if resp_parts:
434
+ resp = "\n\n".join(p for p in resp_parts if p)
435
+ else:
436
+ # Fallback if no assistant tags found
437
+ resp = generated.replace("<|im_start|>assistant", "").replace("<|im_end|>", "").strip()
438
+
439
+ return generated, resp
440
+
441
+
442
+ def format_instruct_prompt(instruction: str) -> str:
443
+ """Format instruction using a simple ChatML-like template."""
444
+ return (
445
+ "<|im_start|>system\n"
446
+ "Answer this question truthfully<|im_end|>\n"
447
+ "<|im_start|>user\n"
448
+ f"{instruction}\n"
449
+ "<|im_end|>\n"
450
+ "<|im_start|>assistant\n"
451
+ )
452
+
453
+
454
+ def visualize_diffusion_state_local(tokenizer, context_ids, mask_blocks, is_masked_list, config, clear=True, block_colors=None):
455
+ """Local visualization copied from infer-base.py to ensure consistent terminal output."""
456
+ import sys
457
+ import os
458
+
459
+ # Default colors for different blocks (green, cyan, yellow, magenta)
460
+ DEFAULT_COLORS = ['\033[92m', '\033[96m', '\033[93m', '\033[95m']
461
+ MASK_COLOR = '\033[90m' # Gray for masked tokens
462
+ RESET = '\033[0m'
463
+
464
+ # Normalize inputs to lists
465
+ if not isinstance(mask_blocks, list):
466
+ mask_blocks = [mask_blocks]
467
+ is_masked_list = [is_masked_list]
468
+
469
+ if block_colors is None:
470
+ block_colors = DEFAULT_COLORS
471
+
472
+ # Decode context (prompt + previously generated blocks) and replace newlines
473
+ try:
474
+ context_text = tokenizer.decode(context_ids[0], skip_special_tokens=True).replace('\n', ' ')
475
+ except Exception:
476
+ # Fallback to str
477
+ context_text = str(context_ids[0].tolist())
478
+
479
+ # Build visualization for all blocks
480
+ all_blocks_text = []
481
+ for block_idx, (mask_block, is_masked) in enumerate(zip(mask_blocks, is_masked_list)):
482
+ color = block_colors[block_idx % len(block_colors)]
483
+ block_tokens = mask_block[0].tolist()
484
+ block_color_tokens = []
485
+
486
+ for i, token_id in enumerate(block_tokens):
487
+ if is_masked[0, i]:
488
+ # Masked tokens render as gray blocks; revealed tokens get the block color
489
+ block_color_tokens.append(f'{MASK_COLOR}██{RESET}')
490
+ else:
491
+ # Decode individual token; use block color for revealed tokens
492
+ try:
493
+ token_text = tokenizer.decode([token_id], skip_special_tokens=False)
494
+ except Exception:
495
+ token_text = str(int(token_id))
496
+ block_color_tokens.append(f'{color}{token_text}{RESET}')
497
+
498
+ all_blocks_text.append(''.join(block_color_tokens))
499
+
500
+ # Concatenate all blocks directly (no separator) so the text reads continuously
501
+ blocks_combined = ''.join(all_blocks_text)
502
+
503
+ # Overwrite previous visualization area (if any) by moving cursor up and clearing lines.
504
+ # This prevents accumulation of repeated frames in terminals like VSCode integrated terminal.
505
+ global _visualize_last_lines
506
+ if clear and _visualize_last_lines > 0:
507
+ try:
508
+ # Move cursor up to the start of the previous block
509
+ sys.stdout.write(f'\x1b[{_visualize_last_lines}A')
510
+ # Clear each line that was previously printed
511
+ for _ in range(_visualize_last_lines):
512
+ sys.stdout.write('\x1b[2K') # Erase entire line
513
+ sys.stdout.write('\x1b[1B') # Move cursor down one line
514
+ # Move cursor back to the top of cleared region
515
+ sys.stdout.write(f'\x1b[{_visualize_last_lines}A')
516
+ sys.stdout.flush()
517
+ except Exception:
518
+ # Fallback to whole-screen clear
519
+ try:
520
+ sys.stdout.write('\x1b[2J\x1b[H')
521
+ sys.stdout.flush()
522
+ except Exception:
523
+ try:
524
+ clear_cmd = 'cls' if os.name == 'nt' else 'clear'
525
+ os.system(clear_cmd)
526
+ except Exception:
527
+ sys.stdout.write('\r\033[K')
528
+ sys.stdout.flush()
529
+ elif clear:
530
+ # No previous region to overwrite; do a simple ANSI clear to start fresh
531
+ try:
532
+ sys.stdout.write('\x1b[2J\x1b[H')
533
+ sys.stdout.flush()
534
+ except Exception:
535
+ try:
536
+ clear_cmd = 'cls' if os.name == 'nt' else 'clear'
537
+ os.system(clear_cmd)
538
+ except Exception:
539
+ sys.stdout.write('\r\033[K')
540
+ sys.stdout.flush()
541
+
542
+ # Print legend for parallel blocks
543
+ if len(mask_blocks) > 1:
544
+ legend_parts = []
545
+ for i in range(len(mask_blocks)):
546
+ color = block_colors[i % len(block_colors)]
547
+ legend_parts.append(f'{color}Block {i+1}{RESET}')
548
+ print(f"Generating: {' | '.join(legend_parts)}\n")
549
+
550
+ # Print the full context with colored blocks
551
+ # Ensure trailing newline so subsequent clears have predictable behavior
552
+ out_text = f"{context_text}{blocks_combined}\n"
553
+ try:
554
+ sys.stdout.write(out_text)
555
+ sys.stdout.flush()
556
+ except Exception:
557
+ print(out_text, flush=True)
558
+
559
+ # Update last-lines counter so next frame can overwrite this one
560
+ try:
561
+ _visualize_last_lines = out_text.count('\n') + (2 if len(mask_blocks) > 1 else 0)
562
+ except Exception:
563
+ _visualize_last_lines = out_text.count('\n')
564
+
565
+
566
+ def main():
567
+ base_path = os.path.join(os.path.dirname(__file__), "infer-base.py")
568
+ base_mod = try_import_infer_base(base_path)
569
+
570
+ if base_mod is None or not hasattr(base_mod, 'DiffusionLLM'):
571
+ raise RuntimeError("DiffusionLLM not found in infer-base.py")
572
+
573
+ DiffusionLLM = base_mod.DiffusionLLM
574
+
575
+ # Workaround for torch.load pickling: checkpoints may reference classes under __main__; exposing them there also lets load_finetuned_model resolve DiffusionLLM
576
+ try:
577
+ main_mod = sys.modules.get('__main__')
578
+ if main_mod is not None:
579
+ if hasattr(base_mod, 'ModelConfig'):
580
+ setattr(main_mod, 'ModelConfig', base_mod.ModelConfig)
581
+ setattr(main_mod, 'DiffusionLLM', DiffusionLLM)
582
+ except Exception:
583
+ pass
584
+
585
+ parser = argparse.ArgumentParser()
586
+ parser.add_argument("--model", type=str, default="./checkpoints/model_fp32.pt", help="Path to model checkpoint")
587
+ parser.add_argument("--tokenizer", type=str, default="Qwen/Qwen2.5-0.5B", help="Tokenizer model id or path")
588
+ parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
589
+ parser.add_argument("--visualize", action="store_true", default=False, help="Enable visualization during generation")
590
+ parser.add_argument("--steps", type=int, default=64)
591
+ parser.add_argument("--block_size", type=int, default=128)
592
+ parser.add_argument("--max_new_tokens", type=int, default=128)
593
+ parser.add_argument("--parallel_blocks", type=int, default=1, help="Number of blocks to generate in parallel")
594
+ args = parser.parse_args()
595
+
596
+ device = torch.device(args.device)
597
+ print(f"Using device: {device}")
598
+
599
+ # Load tokenizer
600
+ print("Loading tokenizer...")
601
+ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
602
+ if tokenizer.pad_token is None:
603
+ # set pad token if not present
604
+ tokenizer.pad_token = tokenizer.eos_token
605
+
606
+ # Load model
607
+ best_model_path = "checkpoints/best_model.pt"
608
+ if os.path.exists(best_model_path):
609
+ print("Loading best model...")
610
+ model, config = load_finetuned_model(best_model_path, device)
611
+ else:
612
+ model, config = load_finetuned_model(args.model, device)
613
+
614
+ # Use the local visualization implementation for consistency
615
+ visualize_fn = None
616
+ if args.visualize:
617
+ visualize_fn = visualize_diffusion_state_local
618
+
619
+ print("Ready. Type a message and press Enter (empty line to quit).\n")
620
+
621
+ while True:
622
+ try:
623
+ user_input = input("User: ").strip()
624
+ except (EOFError, KeyboardInterrupt):
625
+ print("\nExiting.")
626
+ break
627
+ if user_input == "":
628
+ print("Goodbye.")
629
+ break
630
+
631
+ raw_output, response = chat(
632
+ model,
633
+ tokenizer,
634
+ user_input,
635
+ steps=args.steps,
636
+ block_size=args.block_size,
637
+ max_new_tokens=args.max_new_tokens,
638
+ temperature=0.8,
639
+ top_k=50,
640
+ top_p=0.9,
641
+ repetition_penalty=1.2,
642
+ no_repeat_ngram_size=3,
643
+ verbose=False,
644
+ visualize_fn=visualize_fn,
645
+ parallel_blocks=args.parallel_blocks,
646
+ )
647
+
648
+ print("\nRaw Output:\n")
649
+ print(raw_output)
650
+ print("\nAssistant:\n")
651
+ print(response)
652
+ print("\n" + ("=" * 60) + "\n")
653
+
654
+
655
+ if __name__ == "__main__":
656
+ main()
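
Note: the staggered batch layout described in `_generate_parallel_blocks` (row b sees the context plus blocks 0..b, right-padded so all rows share one length, with the attention mask marking real tokens) can be checked in isolation. A minimal sketch; `build_parallel_inputs` is an illustrative helper, not part of this repo:

    import torch

    def build_parallel_inputs(context, num_parallel, block_size, mask_id, pad_id):
        # Row b = context + blocks 0..b (all still masked), right-padded so the
        # whole batch shares one sequence length; mask marks the real tokens.
        ctx_len = context.shape[0]
        max_len = ctx_len + num_parallel * block_size
        blocks = torch.full((num_parallel, block_size), mask_id)
        rows, masks = [], []
        for b in range(num_parallel):
            row = torch.cat([context, blocks[: b + 1].reshape(-1)])
            rows.append(torch.cat([row, torch.full((max_len - row.shape[0],), pad_id)]))
            mask = torch.zeros(max_len)
            mask[: row.shape[0]] = 1.0
            masks.append(mask)
        return torch.stack(rows), torch.stack(masks)

    inp, attn = build_parallel_inputs(torch.arange(5), num_parallel=3,
                                      block_size=4, mask_id=-1, pad_id=0)
    print(inp.shape)          # torch.Size([3, 17])
    print(attn.sum(dim=1))    # tensor([ 9., 13., 17.])

The mask sums confirm each row's visible length grows by exactly one block over the previous row.
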
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ flask>=2.0
2
+ transformers>=4.0.0
3
+ torch
4
+ sentencepiece
5
+ flask_cors
static/ai.mp4 ADDED
Binary file (77.4 kB).
static/index.html ADDED
@@ -0,0 +1,156 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>Diffusion LLM – Chat</title>
7
+ <!-- Tailwind CDN -->
8
+ <script src="https://cdn.tailwindcss.com"></script>
9
+ <!-- Inter Font -->
10
+ <link rel="preconnect" href="https://fonts.gstatic.com">
11
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
12
+ <style>
13
+ html,body{font-family:Inter,ui-sans-serif,system-ui,-apple-system,'Segoe UI',Roboto,'Helvetica Neue',Arial}
14
+ /* custom slider */
15
+ input[type=range] {
16
+ -webkit-appearance: none;
17
+ width: 100%;
18
+ height: 6px;
19
+ border-radius: 5px;
20
+ background: #e0e0e0;
21
+ outline: none;
22
+ }
23
+
24
+ input[type=range]::-webkit-slider-thumb {
25
+ -webkit-appearance: none;
26
+ appearance: none;
27
+ width: 16px;
28
+ height: 16px;
29
+ border-radius: 50%;
30
+ background: #6b21a8;
31
+ cursor: pointer;
32
+ box-shadow: 0 0 2px rgba(0,0,0,0.2);
33
+ transition: background 0.3s ease;
34
+ }
35
+
36
+ input[type=range]::-webkit-slider-thumb:hover {
37
+ background: #7c2dbe;
38
+ }
39
+
40
+ input[type=range]::-moz-range-thumb {
41
+ width: 16px;
42
+ height: 16px;
43
+ border-radius: 50%;
44
+ background: #6b21a8;
45
+ cursor: pointer;
46
+ box-shadow: 0 0 2px rgba(0,0,0,0.2);
47
+ transition: background 0.3s ease;
48
+ }
49
+
50
+ input[type=range]::-moz-range-thumb:hover {
51
+ background: #7c2dbe;
52
+ }
53
+
54
+
55
+
56
+ </style>
57
+ </head>
58
+ <body>
59
+ <div class="h-screen w-screen flex items-start gap-6 p-8 bg-gradient-to-br from-purple-50 to-purple-100">
60
+
61
+ <!-- Sidebar -->
62
+ <aside id="sidebar" class="w-64 h-full bg-white/90 flex flex-col items-center justify-between backdrop-blur-sm rounded-xl p-5 shadow-sm border border-gray-100">
63
+ <div>
64
+ <div class="flex items-center gap-3 mb-4">
65
+ <div class="w-9 h-9 rounded-md bg-gradient-to-br from-purple-200 to-purple-300"></div>
66
+ <div>
67
+ <div class="text-sm font-semibold text-slate-900">Cortex</div>
68
+ <div class="text-xs text-slate-500">Diffusion LLM</div>
69
+ </div>
70
+ </div>
71
+
72
+ <button id="new-chat" class="w-full inline-flex items-center justify-center gap-2 bg-black text-white py-2 rounded-full text-sm font-medium shadow-sm mb-4">+ New chat</button>
73
+
74
+ <nav class="w-full min-w-48 flex-1 flex flex-col gap-2 text-sm" id="chat-list" aria-label="Saved chats">
75
+ <!-- Chat items are dynamically injected here by JavaScript -->
76
+ </nav>
77
+ </div>
78
+
79
+ <div class="mt-6 text-xs text-slate-500 ">Signed in as <strong class="text-slate-700">you@example.com</strong></div>
80
+ </aside>
81
+
82
+ <!-- Main content -->
83
+ <main class="flex-1 flex items-center justify-center w-full h-full">
84
+ <div class="w-full bg-white rounded-2xl p-7 shadow-lg border border-gray-100 flex flex-col h-full">
85
+
86
+ <header class="flex items-center justify-between mb-3 border-b border-gray-200 pb-3">
87
+ <div class="flex items-center gap-3">
88
+ <button id="btn-toggle-sidebar" aria-label="Toggle sidebar" class="inline-flex items-center justify-center p-2 rounded-md bg-white shadow sm:hidden">☰</button>
89
+ <h1 id="app-title" class="text-lg font-semibold">Diffusion LLM Chat</h1>
90
+ </div>
91
+
92
+ <div class="flex items-center gap-3">
93
+ <button id="btn-load" class="bg-black text-white px-3 py-2 rounded-md text-sm font-medium">Load Model</button>
94
+ <span id="load-status" class="text-sm text-slate-500">Not loaded</span>
95
+ </div>
96
+ </header>
97
+
98
+ <section class="flex-1 flex flex-col overflow-hidden">
99
+ <div id="welcome" class="text-center py-6">
100
+ <div class="mx-auto w-24 h-24">
101
+ <video src="/static/ai.mp4" aria-label="Assistant Avatar" autoplay loop muted class="w-full h-full scale-[2] object-cover mix-blend-multiply" style="filter: hue-rotate(45deg)"></video>
102
+ </div>
103
+ <p class="mt-4 text-purple-600 font-medium">Hello, Jagrat Patel</p>
104
+ <h2 class="mt-2 text-2xl font-semibold text-slate-900">How can I assist you today?</h2>
105
+
106
+ <div class="mt-6 flex items-center justify-center gap-4 flex-wrap">
107
+ <button class="bg-white px-5 py-3 rounded-lg shadow-sm border text-sm hover:scale-105 hover:bg-purple-50 hover:border-purple-300 transition-all">Deeper Research &nbsp;<span class="block text-xs text-slate-500 mt-1">Ask for long-form, research-backed answers.</span></button>
108
+ <button class="bg-white px-5 py-3 rounded-lg shadow-sm border text-sm hover:scale-105 hover:bg-purple-50 hover:border-purple-300 transition-all">Saved prompts &nbsp;<span class="block text-xs text-slate-500 mt-1">Quickly reuse your favorite prompts.</span></button>
109
+ <button class="bg-white px-5 py-3 rounded-lg shadow-sm border text-sm hover:scale-105 hover:bg-purple-50 hover:border-purple-300 transition-all">Check Facts &nbsp;<span class="block text-xs text-slate-500 mt-1">Compare GDPR vs CCPA differences.</span></button>
110
+ </div>
111
+ </div>
112
+
113
+ <div id="chat" class="hidden flex-1 overflow-auto px-2 py-3" role="log" aria-live="polite">
114
+ <!-- messages injected here -->
115
+ </div>
116
+ </section>
117
+
118
+ <form id="prompt-form" class="mt-4 bg-white p-4 rounded-xl shadow-inner border border-gray-100" aria-label="Chat prompt">
119
+ <div class="mb-4 flex flex-row gap-4 flex-wrap items-center justify-between">
120
+ <div class="flex items-center gap-4 w-[24%]">
121
+ <label for="steps" class="text-sm font-medium text-slate-700">Steps:</label>
122
+ <input type="range" id="steps" min="1" max="100" value="64" class="flex-1">
123
+ <span id="steps-value" class="text-sm text-slate-500 w-8">64</span>
124
+ </div>
125
+ <div class="flex items-center gap-4 w-[24%]">
126
+ <label for="block_size" class="text-sm font-medium text-slate-700">Block Size:</label>
127
+ <input type="range" id="block_size" min="8" max="256" value="128" step="8" class="flex-1">
128
+ <span id="block_size-value" class="text-sm text-slate-500 w-8">128</span>
129
+ </div>
130
+ <div class="flex items-center gap-4 w-[24%]">
131
+ <label for="max_new_tokens" class="text-sm font-medium text-slate-700">Max New Tokens:</label>
132
+ <input type="range" id="max_new_tokens" min="32" max="1024" value="128" step="32" class="flex-1">
133
+ <span id="max_new_tokens-value" class="text-sm text-slate-500 w-8">128</span>
134
+ </div>
135
+ <div class="flex items-center gap-4 w-[24%]">
136
+ <label for="parallel_blocks" class="text-sm font-medium text-slate-700">Parallel Blocks:</label>
137
+ <input type="range" id="parallel_blocks" min="1" max="4" value="1" step="1" class="flex-1">
138
+ <span id="parallel_blocks-value" class="text-sm text-slate-500 w-8">1</span>
139
+ </div>
140
+ </div>
141
+ <div class="flex gap-3">
142
+ <textarea id="prompt" class="flex-1 resize-y rounded-lg border border-gray-200 p-3 text-sm focus:outline-none focus:ring-[1px] focus:ring-purple-500 focus:border-purple-500" placeholder="Ask me anything..." rows="2" aria-label="Message input"></textarea>
143
+ <div class="flex flex-col justify-between">
144
+ <button type="submit" id="btn-send" class="bg-black text-white px-4 py-2 rounded-md">Send</button>
145
+ </div>
146
+ </div>
147
+ </form>
148
+
149
+ <div class="mt-4 text-center text-xs text-slate-500">Model served by Flask. See README for run instructions.</div>
150
+ </div>
151
+ </main>
152
+ </div>
153
+
154
+ <script src="/static/main.js"></script>
155
+ </body>
156
+ </html>
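
Note: the sliders above map one-to-one onto the JSON payload that main.js posts to `/api/generate-stream` (implemented in app.py, not shown in this excerpt). A minimal sketch of the same request from Python, assuming the server is running on port 7860 and that the `requests` package is available (it is not pinned in requirements.txt):

    import json
    import requests  # assumed extra dependency; not in requirements.txt

    payload = {
        "instruction": "Who wrote Hamlet?",
        "steps": 64,
        "block_size": 128,
        "max_new_tokens": 128,
        "parallel_blocks": 1,
    }
    with requests.post("http://localhost:7860/api/generate-stream",
                       json=payload, stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if line and line.startswith("data: "):
                event = json.loads(line[len("data: "):])
                if event.get("type") == "complete":
                    print(event.get("response"))
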
static/main.js ADDED
@@ -0,0 +1,346 @@
1
+ // Global state
2
+ let isModelLoaded = false;
3
+
4
+ // DOM Elements
5
+ const els = {
6
+ chat: document.getElementById("chat"),
7
+ promptForm: document.getElementById("prompt-form"),
8
+ promptInput: document.getElementById("prompt"),
9
+ loadBtn: document.getElementById("btn-load"),
10
+ testStreamBtn: document.getElementById("btn-test-stream"),
11
+ status: document.getElementById("load-status"),
12
+ sidebar: document.getElementById("sidebar"),
13
+ sidebarToggle: document.getElementById("btn-toggle-sidebar"),
14
+ chatList: document.getElementById("chat-list"),
15
+ newChatBtn: document.getElementById("new-chat"),
16
+ sendBtn: document.getElementById("btn-send"),
17
+ steps: document.getElementById("steps"),
18
+ block_size: document.getElementById("block_size"),
19
+ max_new_tokens: document.getElementById("max_new_tokens"),
20
+ parallel_blocks: document.getElementById("parallel_blocks"),
21
+ stepsValue: document.getElementById("steps-value"),
22
+ block_sizeValue: document.getElementById("block_size-value"),
23
+ max_new_tokensValue: document.getElementById("max_new_tokens-value"),
24
+ parallel_blocksValue: document.getElementById("parallel_blocks-value"),
25
+ };
26
+
27
+ // Update slider values
28
+ els.steps.addEventListener("input", () => {
29
+ els.stepsValue.textContent = els.steps.value;
30
+ });
31
+ els.block_size.addEventListener("input", () => {
32
+ els.block_sizeValue.textContent = els.block_size.value;
33
+ });
34
+ els.max_new_tokens.addEventListener("input", () => {
35
+ els.max_new_tokensValue.textContent = els.max_new_tokens.value;
36
+ });
37
+ els.parallel_blocks.addEventListener("input", () => {
38
+ els.parallel_blocksValue.textContent = els.parallel_blocks.value;
39
+ });
40
+
41
+ // --- Logic ---
42
+
43
+ async function checkLoadStatus() {
44
+ try {
45
+ const res = await fetch("/api/load", {
46
+ method: "POST",
47
+ headers: { "Content-Type": "application/json" },
48
+ body: JSON.stringify({ check_only: true }),
49
+ });
50
+
51
+ if (res.ok) {
52
+ const data = await res.json();
53
+ if (data.loaded) {
54
+ isModelLoaded = true;
55
+ els.status.textContent = "Ready";
56
+ els.status.className = "text-sm text-green-600 font-medium";
57
+ els.loadBtn.style.display = 'none';
58
+ }
59
+ }
60
+ } catch (e) {
61
+ console.log("Model check failed:", e);
62
+ }
63
+ }
64
+
65
+ els.loadBtn.addEventListener("click", async () => {
66
+ els.loadBtn.disabled = true;
67
+ els.status.textContent = "Loading Model (this may take time)...";
68
+ els.status.className = "text-sm text-yellow-600 font-medium";
69
+
70
+ try {
71
+ const res = await fetch("/api/load", {
72
+ method: "POST",
73
+ headers: { "Content-Type": "application/json" },
74
+ body: JSON.stringify({ check_only: false }),
75
+ });
76
+ const data = await res.json();
77
+
78
+ if (res.ok) {
79
+ isModelLoaded = true;
80
+ els.status.textContent = "Model Loaded";
81
+ els.status.className = "text-sm text-green-600 font-medium";
82
+ els.loadBtn.style.display = 'none';
83
+ } else {
84
+ throw new Error(data.message || "Load failed");
85
+ }
86
+ } catch (e) {
87
+ els.status.textContent = "Error Loading";
88
+ els.status.className = "text-sm text-red-500";
89
+ alert("Error: " + e.message);
90
+ } finally {
91
+ els.loadBtn.disabled = false;
92
+ }
93
+ });
94
+
95
+ els.promptForm.addEventListener("submit", async (e) => {
96
+ e.preventDefault();
97
+
98
+ const text = els.promptInput.value.trim();
99
+ if (!text) return;
100
+
101
+ // UI Updates
102
+ addMessage("user", text);
103
+ els.promptInput.value = "";
104
+
105
+ // Create Assistant Bubble
106
+ const assistantBubble = addMessage("assistant", "");
107
+ const contentPre = assistantBubble.querySelector(".content");
108
+ const textContent = contentPre.querySelector(".text-content");
109
+
110
+ const visualizationDiv = document.createElement("div");
111
+ visualizationDiv.className = "visualization mb-2 font-mono text-xs";
112
+
113
+ // Loading spinner (SVG)
114
+ const spinner = document.createElement("div");
115
+ spinner.className = "flex items-center gap-2 text-slate-400";
116
+ spinner.innerHTML = `
117
+ <svg class="animate-spin h-4 w-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
118
+ <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
119
+ <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8v4a4 4 0 00-4 4H4z"></path>
120
+ </svg>
121
+ <span class="text-xs">Generating...</span>
122
+ `;
123
+ visualizationDiv.appendChild(spinner);
124
+
125
+ contentPre.insertBefore(visualizationDiv, textContent);
126
+
127
+ // Disable send button
128
+ els.sendBtn.disabled = true;
129
+ els.sendBtn.textContent = "Generating...";
130
+ els.promptInput.disabled = true;
131
+
132
+ // Generate Request with Streaming
133
+ try {
134
+ const res = await fetch("/api/generate-stream", {
135
+ method: "POST",
136
+ headers: { "Content-Type": "application/json" },
137
+ body: JSON.stringify({
138
+ instruction: text,
139
+ steps: parseInt(els.steps.value),
140
+ block_size: parseInt(els.block_size.value),
141
+ max_new_tokens: parseInt(els.max_new_tokens.value),
142
+ parallel_blocks: parseInt(els.parallel_blocks.value),
143
+ }),
144
+ });
145
+
146
+ if (!res.ok) {
147
+ throw new Error(`Server Error ${res.status}`);
148
+ }
149
+
150
+ const reader = res.body.getReader();
151
+ const decoder = new TextDecoder();
152
+ let buffer = "";
153
+
154
+ while (true) {
155
+ const { done, value } = await reader.read();
156
+
157
+ if (done) break;
158
+
159
+ buffer += decoder.decode(value, { stream: true });
160
+ const lines = buffer.split("\n");
161
+ buffer = lines.pop(); // Keep incomplete line in buffer
162
+
163
+ for (const line of lines) {
164
+ if (line.startsWith("data: ")) {
165
+ const jsonStr = line.slice(6);
166
+ if (jsonStr.trim()) {
167
+ try {
168
+ const data = JSON.parse(jsonStr);
169
+ handleStreamEvent(data, visualizationDiv, textContent);
170
+ } catch (e) {
171
+ console.error("Failed to parse SSE data:", e);
172
+ }
173
+ }
174
+ }
175
+ }
176
+ }
177
+ } catch (error) {
178
+ if (textContent) textContent.textContent = `Error: ${error.message}`;
179
+ } finally {
180
+ els.sendBtn.disabled = false;
181
+ els.sendBtn.textContent = "Send";
182
+ els.promptInput.disabled = false;
183
+ }
184
+ });
185
+
186
+ function handleStreamEvent(data, visualizationDiv, textContent) {
187
+ if (data.type === "start") {
188
+ textContent.textContent = "";
189
+ } else if (data.type === "update") {
190
+ // Render visualization
191
+ renderVisualization(data.data, visualizationDiv);
192
+ scrollToBottom();
193
+ } else if (data.type === "complete") {
194
+ // Clear visualization and show final response
195
+ visualizationDiv.innerHTML = "";
196
+ textContent.textContent = data.response || "No response";
197
+ scrollToBottom();
198
+ } else if (data.type === "error") {
199
+ textContent.textContent = `Error: ${data.error}`;
200
+ }
201
+ }
202
+
203
+ function renderVisualization(vizData, container) {
204
+ // Clear previous content
205
+ container.innerHTML = "";
206
+
207
+ // Show context
208
+ const contextDiv = document.createElement("div");
209
+ contextDiv.className = "text-slate-600 mb-1";
210
+ contextDiv.textContent = vizData.context;
211
+ container.appendChild(contextDiv);
212
+
213
+ // Show blocks
214
+ const blocksDiv = document.createElement("div");
215
+ blocksDiv.classList.add("flex", "flex-wrap", "gap-0");
216
+
217
+ const blockColors = ["text-green-600", "text-cyan-600", "text-yellow-600", "text-purple-600"];
218
+
219
+ vizData.blocks.forEach((block, blockIdx) => {
220
+ const blockSpan = document.createElement("span");
221
+ blockSpan.className = blockColors[blockIdx % blockColors.length];
222
+
223
+ block.tokens.forEach((token) => {
224
+ if (token.type === "masked") {
225
+ const maskedSpan = document.createElement("span");
226
+ maskedSpan.className = blockColors[blockIdx % blockColors.length];
227
+ maskedSpan.innerText = token.text + " ";
228
+ blockSpan.appendChild(maskedSpan);
229
+ } else {
230
+ const textNode = document.createTextNode(token.text);
231
+ blockSpan.appendChild(textNode);
232
+ }
233
+ });
234
+
235
+ blocksDiv.appendChild(blockSpan);
236
+ });
237
+
238
+ container.appendChild(blocksDiv);
239
+
240
+ // Add legend if multiple blocks
241
+ if (vizData.num_blocks > 1) {
242
+ const legendDiv = document.createElement("div");
243
+ legendDiv.className = "text-xs text-slate-500 mt-1";
244
+ const legends = [];
245
+ for (let i = 0; i < vizData.num_blocks; i++) {
246
+ legends.push(`Block ${i + 1}`);
247
+ }
248
+ legendDiv.textContent = `Generating: ${legends.join(" | ")}`;
249
+ container.appendChild(legendDiv);
250
+ }
251
+ }
252
+
253
+ // --- UI Helpers ---
254
+
255
+ function addMessage(role, text) {
256
+ const wrapper = document.createElement("div");
257
+ wrapper.className = "mb-6 max-w-[100%] flex flex-col";
258
+
259
+ const bubble = document.createElement("div");
260
+ const isUser = role === "user";
261
+
262
+ bubble.className = isUser ? "self-end bg-slate-900 text-white p-4 rounded-2xl rounded-tr-sm max-w-[85%]" : "self-start bg-white border border-gray-200 text-slate-800 p-4 rounded-2xl rounded-tl-sm max-w-[65%] whitespace-pre-wrap overflow-x-auto shadow-sm flex flex-wrap";
263
+
264
+ // Main Content container that holds the response text
265
+ const pre = document.createElement("div");
266
+ pre.className = "content whitespace-pre-wrap font-sans text-sm leading-relaxed";
267
+
268
+ // The actual text content
269
+ const textSpan = document.createElement("span");
270
+ textSpan.className = "text-content";
271
+ textSpan.textContent = text;
272
+
273
+ pre.appendChild(textSpan);
274
+ bubble.appendChild(pre);
275
+ wrapper.appendChild(bubble);
276
+ els.chat.appendChild(wrapper);
277
+ scrollToBottom();
278
+
279
+ // Hide welcome screen
280
+ const welcome = document.getElementById("welcome");
281
+ if (welcome) {
282
+ welcome.classList.add("hidden");
283
+ }
284
+ els.chat.classList.remove("hidden");
285
+
286
+ return bubble;
287
+ }
288
+
289
+ function scrollToBottom() {
290
+ els.chat.scrollTop = els.chat.scrollHeight;
291
+ }
292
+
293
+ // Sidebar Toggle
294
+ els.sidebarToggle.addEventListener("click", () => {
295
+ els.sidebar.classList.toggle("-translate-x-full");
296
+ });
297
+
298
+ // New Chat Button
299
+ els.newChatBtn.addEventListener("click", () => {
300
+ // Clear chat
301
+ els.chat.innerHTML = "";
302
+ els.chat.classList.add("hidden");
303
+
304
+ // Show welcome screen
305
+ const welcome = document.getElementById("welcome");
306
+ if (welcome) {
307
+ welcome.classList.remove("hidden");
308
+ }
309
+
310
+ // Clear input
311
+ els.promptInput.value = "";
312
+ });
313
+
314
+ // Initialize
315
+ (async () => {
316
+ await checkLoadStatus();
317
+ if (!isModelLoaded) {
318
+ els.loadBtn.disabled = true;
319
+ els.status.textContent = "Loading Model (this may take time)...";
320
+ els.status.className = "text-sm text-yellow-600 font-medium";
321
+
322
+ try {
323
+ const res = await fetch("/api/load", {
324
+ method: "POST",
325
+ headers: { "Content-Type": "application/json" },
326
+ body: JSON.stringify({ check_only: false }),
327
+ });
328
+ const data = await res.json();
329
+
330
+ if (res.ok) {
331
+ isModelLoaded = true;
332
+ els.status.textContent = "Model Loaded";
333
+ els.status.className = "text-sm text-green-600 font-medium";
334
+ els.loadBtn.style.display = 'none';
335
+ } else {
336
+ throw new Error(data.message || "Load failed");
337
+ }
338
+ } catch (e) {
339
+ els.status.textContent = "Error Loading";
340
+ els.status.className = "text-sm text-red-500";
341
+ } finally {
342
+ els.loadBtn.disabled = false;
343
+ }
344
+ }
345
+ })();
346
+ els.chat.classList.add("hidden");
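
Note: `handleStreamEvent` above expects Server-Sent Events with a `type` of `start`, `update`, `complete`, or `error`. The real handler lives in app.py; the following is only a minimal sketch of an endpoint shape compatible with this client, with a hard-coded echo standing in for actual generation:

    import json
    from flask import Flask, Response, request, stream_with_context

    app = Flask(__name__)

    @app.route("/api/generate-stream", methods=["POST"])
    def generate_stream():
        params = request.get_json(force=True)

        def events():
            yield "data: " + json.dumps({"type": "start"}) + "\n\n"
            # per-step visualization frames would be yielded here as
            # {"type": "update", "data": {...}} events
            yield "data: " + json.dumps({
                "type": "complete",
                "response": f"(echo) {params.get('instruction', '')}",
            }) + "\n\n"

        return Response(stream_with_context(events()), mimetype="text/event-stream")
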