Update app.py
app.py CHANGED
@@ -1,475 +1,475 @@
The previous revision was identical to the file below except for its final line, which had been left truncated; this commit completes it:

-    app.run(debug=True, host='0.0.0.0', port=
+    app.run(debug=True, host='0.0.0.0', port=7860, threaded=True)
The full app.py as committed:

import os
import sys
import json
import time
import importlib.util
from pathlib import Path
from flask import Flask, request, jsonify, Response, stream_with_context
from flask_cors import CORS
import torch
from transformers import AutoTokenizer

app = Flask(__name__, static_folder='static', static_url_path='/static')
CORS(app)

# Global state
model = None
tokenizer = None
config = None
device = None
DiffusionLLM = None
chat_function = None


def find_file(filename, search_dirs=None):
    """Find a file in the current directory or parent directories."""
    if search_dirs is None:
        search_dirs = [
            os.path.dirname(__file__),  # Current directory
            os.path.dirname(os.path.dirname(__file__)),  # Parent directory
            os.getcwd(),  # Working directory
        ]

    for directory in search_dirs:
        filepath = os.path.join(directory, filename)
        if os.path.exists(filepath):
            print(f"Found {filename} at: {filepath}")
            return filepath

    return None


def try_import_module(filepath, module_name):
    """Dynamically import a Python file as a module."""
    if not filepath or not os.path.exists(filepath):
        return None

    try:
        # Add the directory to sys.path
        module_dir = os.path.dirname(filepath)
        if module_dir not in sys.path:
            sys.path.insert(0, module_dir)

        spec = importlib.util.spec_from_file_location(module_name, filepath)
        if spec is None:
            print(f"Could not create spec for {filepath}")
            return None

        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)

        print(f"Successfully imported {module_name} from {filepath}")
        return module
    except Exception as e:
        print(f"Error importing {filepath}: {e}")
        import traceback
        traceback.print_exc()
        return None


def load_model_internal():
    """Load the model and tokenizer."""
    global model, tokenizer, config, device, DiffusionLLM, chat_function

    if model is not None:
        return True

    try:
        print("=" * 60)
        print("Starting model loading process...")
        print("=" * 60)

        # Find and import infer-base.py
        base_path = find_file("infer-base.py")
        if base_path is None:
            raise RuntimeError("Could not find infer-base.py. Make sure it's in the same directory as app.py or in the parent directory.")

        print(f"\nImporting infer-base.py from: {base_path}")
        base_mod = try_import_module(base_path, "infer_base")

        if base_mod is None:
            raise RuntimeError("Failed to import infer-base.py")

        # Check for DiffusionLLM class
        if not hasattr(base_mod, 'DiffusionLLM'):
            print("Available attributes in infer_base:", dir(base_mod))
            raise RuntimeError("DiffusionLLM class not found in infer-base.py")

        DiffusionLLM = base_mod.DiffusionLLM
        print("✓ Successfully loaded DiffusionLLM class")

        # Find and import infer-chat.py
        chat_path = find_file("infer-chat.py")
        if chat_path is None:
            raise RuntimeError("Could not find infer-chat.py")

        print(f"\nImporting infer-chat.py from: {chat_path}")
        chat_mod = try_import_module(chat_path, "infer_chat")

        if chat_mod is None or not hasattr(chat_mod, 'chat'):
            raise RuntimeError("Failed to import chat function from infer-chat.py")

        chat_function = chat_mod.chat
        print("✓ Successfully loaded chat function")

        # Register classes under __main__ so torch.load can unpickle checkpoints
        # that were saved from a script where these classes lived in __main__.
        try:
            if hasattr(base_mod, 'ModelConfig'):
                sys.modules['__main__'].ModelConfig = base_mod.ModelConfig
            sys.modules['__main__'].DiffusionLLM = DiffusionLLM
            print("✓ Configured pickle support for model loading")
        except Exception as e:
            print(f"Warning: Could not setup pickle workaround: {e}")

        # Set device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"\n✓ Using device: {device}")

        # Load tokenizer
        print("\nLoading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("✓ Tokenizer loaded")

        # Find model checkpoint
        checkpoint_dirs = [
            "checkpoints",
            "../checkpoints",
            "./checkpoints",
            os.path.join(os.path.dirname(__file__), "checkpoints"),
            os.path.join(os.path.dirname(__file__), "../checkpoints"),
        ]

        model_path = None
        for checkpoint_dir in checkpoint_dirs:
            best_path = os.path.join(checkpoint_dir, "best_model.pt")
            fp32_path = os.path.join(checkpoint_dir, "model_fp32.pt")

            if os.path.exists(best_path):
                model_path = best_path
                break
            elif os.path.exists(fp32_path):
                model_path = fp32_path
                break

        if model_path is None:
            raise RuntimeError(
                "Could not find model checkpoint. Looking for:\n"
                " - checkpoints/best_model.pt\n"
                " - checkpoints/model_fp32.pt\n"
                f"Searched directories: {checkpoint_dirs}"
            )

        print(f"\n✓ Found model checkpoint: {model_path}")
        print("Loading model weights (this may take a minute)...")

        # Load model
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
        config = checkpoint['config']

        print("Creating model...")
        model = DiffusionLLM(config)

        print("Loading state dict...")
        state_dict = checkpoint['model_state']
        state_dict = {k: v.float() for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

        model = model.to(device)
        model.eval()

        num_params = sum(p.numel() for p in model.parameters()) / 1e6
        print(f"\n{'=' * 60}")
        print("✓✓✓ MODEL LOADED SUCCESSFULLY ✓✓✓")
        print(f"{'=' * 60}")
        print(f"Parameters: {num_params:.1f}M")
        if 'step' in checkpoint:
            print(f"Training steps: {checkpoint['step']}")
        if 'best_val_loss' in checkpoint:
            print(f"Best validation loss: {checkpoint['best_val_loss']:.4f}")
        print(f"{'=' * 60}\n")

        return True

    except Exception as e:
        print("\n" + "=" * 60)
        print("ERROR LOADING MODEL")
        print("=" * 60)
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        print("=" * 60 + "\n")
        return False


def create_streaming_visualizer():
    """Create a visualizer that yields SSE events instead of printing to the terminal."""
    def visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear=True):
        # Normalize inputs to lists
        if not isinstance(mask_blocks, list):
            mask_blocks = [mask_blocks]
            is_masked_list = [is_masked_list]

        # Decode context
        try:
            context_text = tok.decode(context_ids[0], skip_special_tokens=True).replace('\n', ' ')
        except Exception:
            context_text = str(context_ids[0].tolist())

        # Build blocks visualization
        all_blocks = []
        for block_idx, (mask_block, is_masked) in enumerate(zip(mask_blocks, is_masked_list)):
            block_tokens = mask_block[0].tolist()
            block_data = []

            for i, token_id in enumerate(block_tokens):
                if is_masked[0, i]:
                    block_data.append({
                        'type': 'masked',
                        'text': '███'
                    })
                else:
                    try:
                        token_text = tok.decode([token_id], skip_special_tokens=False)
                    except Exception:
                        token_text = str(int(token_id))
                    block_data.append({
                        'type': 'revealed',
                        'text': token_text
                    })

            all_blocks.append({
                'block_index': block_idx,
                'tokens': block_data
            })

        # Return a data structure that will be sent as an SSE event
        return {
            'context': context_text,
            'blocks': all_blocks,
            'num_blocks': len(mask_blocks)
        }

    return visualizer


@app.route('/')
def index():
    """Serve the main HTML page."""
    return app.send_static_file('index.html')


@app.route('/api/load', methods=['POST'])
def load_model_endpoint():
    """Load the model."""
    global model

    data = request.json or {}
    check_only = data.get('check_only', False)

    if check_only:
        return jsonify({
            'loaded': model is not None,
            'message': 'Model is loaded' if model is not None else 'Model not loaded'
        })

    if model is not None:
        return jsonify({
            'loaded': True,
            'message': 'Model already loaded'
        })

    success = load_model_internal()

    if success:
        return jsonify({
            'loaded': True,
            'message': 'Model loaded successfully'
        })
    else:
        return jsonify({
            'loaded': False,
            'message': 'Failed to load model. Check server logs for details.'
        }), 500


@app.route('/api/generate', methods=['POST'])
def generate():
    """Generate a response without streaming."""
    global model, tokenizer, config, device, chat_function

    if model is None:
        return jsonify({'error': 'Model not loaded'}), 400

    if chat_function is None:
        return jsonify({'error': 'Chat function not available'}), 400

    data = request.json
    instruction = data.get('instruction', '')
    steps = data.get('steps', 64)
    block_size = data.get('block_size', 128)
    max_new_tokens = data.get('max_new_tokens', 128)
    parallel_blocks = data.get('parallel_blocks', 1)

    if not instruction:
        return jsonify({'error': 'No instruction provided'}), 400

    try:
        # Generate response
        raw_output, response = chat_function(
            model,
            tokenizer,
            instruction,
            steps=steps,
            block_size=block_size,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            verbose=False,
            visualize_fn=None,
            parallel_blocks=parallel_blocks,
        )

        return jsonify({
            'response': response,
            'raw_output': raw_output
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/api/generate-stream', methods=['POST'])
def generate_stream():
    """Generate a response with streaming visualization."""
    global model, tokenizer, config, device, chat_function

    if model is None:
        return jsonify({'error': 'Model not loaded'}), 400

    if chat_function is None:
        return jsonify({'error': 'Chat function not available'}), 400

    data = request.json
    instruction = data.get('instruction', '')
    steps = data.get('steps', 64)
    block_size = data.get('block_size', 128)
    max_new_tokens = data.get('max_new_tokens', 128)
    parallel_blocks = data.get('parallel_blocks', 1)

    if not instruction:
        return jsonify({'error': 'No instruction provided'}), 400

    def generate_events():
        try:
            # A queue lets the visualizer callback hand events back to this generator
            import queue
            event_queue = queue.Queue()
            generation_complete = {'done': False, 'result': None}

            def streaming_visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear=True):
                """Called during generation; push each update into the queue immediately."""
                visualizer = create_streaming_visualizer()
                data = visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear)
                # Put the update in the queue so it can be yielded immediately
                event_queue.put({'type': 'update', 'data': data})

            # Run generation in a separate thread so we can yield events as they arrive
            import threading

            def run_generation():
                try:
                    raw_output, response = chat_function(
                        model,
                        tokenizer,
                        instruction,
                        steps=steps,
                        block_size=block_size,
                        max_new_tokens=max_new_tokens,
                        temperature=0.8,
                        top_k=50,
                        top_p=0.9,
                        repetition_penalty=1.2,
                        no_repeat_ngram_size=3,
                        verbose=False,
                        visualize_fn=streaming_visualizer,
                        parallel_blocks=parallel_blocks,
                    )
                    generation_complete['result'] = (raw_output, response)
                except Exception as e:
                    generation_complete['result'] = ('error', str(e))
                finally:
                    generation_complete['done'] = True
                    event_queue.put(None)  # Signal completion

            # Start generation thread
            gen_thread = threading.Thread(target=run_generation)
            gen_thread.daemon = True
            gen_thread.start()

            # Yield start event
            yield f"data: {json.dumps({'type': 'start', 'message': 'Generation started'})}\n\n"

            # Yield events as they come from the queue
            while not generation_complete['done'] or not event_queue.empty():
                try:
                    event = event_queue.get(timeout=0.1)
                    if event is None:  # Completion signal
                        break
                    yield f"data: {json.dumps(event)}\n\n"
                except queue.Empty:
                    continue

            # Wait for the thread to finish
            gen_thread.join(timeout=1.0)

            # Send final response
            if generation_complete['result']:
                raw_output, response = generation_complete['result']
                if raw_output == 'error':
                    yield f"data: {json.dumps({'type': 'error', 'error': response})}\n\n"
                else:
                    yield f"data: {json.dumps({'type': 'complete', 'response': response, 'raw_output': raw_output})}\n\n"

        except Exception as e:
            import traceback
            traceback.print_exc()
            yield f"data: {json.dumps({'type': 'error', 'error': str(e)})}\n\n"

    return Response(
        stream_with_context(generate_events()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )


@app.route('/api/test-stream', methods=['GET'])
def test_stream():
    """Test streaming endpoint."""
    def generate():
        for i in range(10):
            yield f"data: {json.dumps({'message': f'Test message {i+1}'})}\n\n"
            time.sleep(0.5)
        yield f"data: {json.dumps({'message': 'Stream complete'})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=7860, threaded=True)
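For reference, here is a minimal client sketch for the API above. It is an illustration, not part of the Space: the base URL assumes the server is running locally on the port configured in app.run(), the example instruction is made up, and the requests library is an external dependency. The event names ('start', 'update', 'complete', 'error') and payload fields mirror those emitted by generate_stream().

import json

import requests  # assumed third-party dependency for this sketch

BASE = "http://localhost:7860"  # assumes a local server started via app.run()

# Trigger model loading (idempotent; returns immediately if already loaded).
print(requests.post(f"{BASE}/api/load", json={}).json())

# Stream generation events. Each SSE frame arrives as a "data: {...}" line.
payload = {
    "instruction": "Tell me a story",  # hypothetical prompt
    "steps": 64,
    "block_size": 128,
    "max_new_tokens": 128,
    "parallel_blocks": 1,
}
with requests.post(f"{BASE}/api/generate-stream", json=payload, stream=True) as r:
    for line in r.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separator lines between SSE frames
        event = json.loads(line[len("data: "):])
        if event["type"] == "update":
            # Per-block token states ('masked' / 'revealed') for visualization.
            blocks = event["data"]["blocks"]
        elif event["type"] == "complete":
            print(event["response"])
            break
        elif event["type"] == "error":
            raise RuntimeError(event["error"])

The non-streaming /api/generate endpoint accepts the same payload and returns a single JSON object with 'response' and 'raw_output'; /api/test-stream is useful for checking that SSE frames pass through any proxy unbuffered before debugging the model itself.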