thejagstudio committed
Commit 527cb39 · verified · 1 Parent(s): 01cf381

Update app.py

Files changed (1)
  1. app.py +148 -187
app.py CHANGED
@@ -487,9 +487,43 @@ import torch
487
  from transformers import AutoTokenizer
488
  import threading
489
  import queue
490
 
491
  app = Flask(__name__, static_folder='static', static_url_path='/static')
492
- CORS(app)
493
 
494
  # Global state
495
  model = None
@@ -498,15 +532,7 @@ config = None
498
  device = None
499
  DiffusionLLM = None
500
  chat_function = None
501
- optimized_pipeline = None # For ONNX/IPEX
502
-
503
- # ==================== CONFIGURATION ====================
504
- USE_ONNX_RUNTIME = False # Set True for ONNX (fastest)
505
- USE_IPEX = False # Set True for Intel CPUs
506
- USE_TORCH_COMPILE = True # Set True for PyTorch 2.0+ (good default)
507
- QUANTIZE_MODEL = True # INT8/BF16 quantization
508
- WARMUP_ITERATIONS = 3 # Warmup for stable performance
509
- # =======================================================
510
 
511
  def find_file(filename, search_dirs=None):
512
  """Find a file in current directory or parent directories."""
@@ -531,53 +557,25 @@ def try_import_module(filepath, module_name):
531
  module_dir = os.path.dirname(filepath)
532
  if module_dir not in sys.path:
533
  sys.path.insert(0, module_dir)
 
534
  spec = importlib.util.spec_from_file_location(module_name, filepath)
535
  if spec is None:
536
  print(f"Could not create spec for {filepath}")
537
  return None
 
538
  module = importlib.util.module_from_spec(spec)
539
  sys.modules[module_name] = module
540
  spec.loader.exec_module(module)
541
- print(f"Successfully imported {module_name} from {filepath}")
 
542
  return module
543
  except Exception as e:
544
- print(f"Error importing {filepath}: {e}")
545
  if __debug__:
546
  import traceback
547
  traceback.print_exc()
548
  return None
549
 
550
- def configure_cpu_optimization():
551
- """Configure CPU for maximum performance."""
552
- print("\n" + "=" * 60)
553
- print("CPU OPTIMIZATION CONFIGURATION")
554
- print("=" * 60)
555
-
556
- # Get CPU info
557
- cpu_count = os.cpu_count()
558
- physical_cores = cpu_count // 2 if cpu_count else cpu_count
559
-
560
- # Optimal thread settings
561
- threads = physical_cores or cpu_count or 1
562
- torch.set_num_threads(threads)
563
- torch.set_num_interop_threads(threads)
564
-
565
- # Environment variables for MKL/OMP
566
- os.environ["OMP_NUM_THREADS"] = str(threads)
567
- os.environ["MKL_NUM_THREADS"] = str(threads)
568
- os.environ["NUMEXPR_NUM_THREADS"] = str(threads)
569
-
570
- print(f"✓ CPU Cores: {cpu_count} ({physical_cores} physical)")
571
- print(f"✓ Threads: {threads}")
572
- print(f"✓ OMP/MKL threads: {threads}")
573
-
574
- # Intel-specific optimizations
575
- if "intel" in torch.__version__.lower() or USE_IPEX:
576
- torch.backends.quantized.engine = 'fbgemm'
577
- print("✓ Intel FBGEMM backend enabled")
578
-
579
- print("=" * 60 + "\n")
580
-
581
  def quantize_model(model):
582
  """Apply quantization for faster inference."""
583
  if not QUANTIZE_MODEL:
@@ -585,20 +583,13 @@ def quantize_model(model):
585
 
586
  print("\nApplying quantization...")
587
  try:
588
- # Use torch.compile with quantization if available
589
- if hasattr(torch, 'ao') and hasattr(torch.ao, 'quantization'):
590
- # Dynamic quantization (fastest, no calibration needed)
591
- model = torch.quantization.quantize_dynamic(
592
- model,
593
- {torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d},
594
- dtype=torch.qint8
595
- )
596
- print("✓ Applied INT8 dynamic quantization")
597
- elif USE_IPEX:
598
- # IPEX BF16 optimization
599
- model = model.to(torch.bfloat16)
600
- print("✓ Applied BF16 precision (IPEX)")
601
-
602
  return model
603
  except Exception as e:
604
  print(f"⚠ Quantization failed: {e}")
@@ -606,124 +597,114 @@ def quantize_model(model):
606
 
607
  def compile_model(model):
608
  """Compile model for maximum speed."""
609
- global USE_TORCH_COMPILE, USE_ONNX_RUNTIME
610
-
611
  print("\nCompiling model...")
612
 
613
- # Option 1: ONNX Runtime (BEST performance)
614
  if USE_ONNX_RUNTIME:
615
  try:
616
  import onnxruntime as ort
617
- # Export to ONNX (one-time cost, but worth it)
618
  onnx_path = "model_optimized.onnx"
 
 
619
  if not os.path.exists(onnx_path):
620
- print("Exporting to ONNX format...")
621
- dummy_input = torch.randint(0, 100, (1, 128)) # Adjust shape as needed
622
  torch.onnx.export(
623
- model,
624
- dummy_input,
625
- onnx_path,
626
  input_names=['input_ids'],
627
  output_names=['logits'],
628
  dynamic_axes={'input_ids': {0: 'batch', 1: 'sequence'}},
629
  opset_version=16
630
  )
631
 
632
- # Create ONNX Runtime session
633
  sess_options = ort.SessionOptions()
634
  sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
635
- sess_options.enable_cpu_mem_arena = True
636
- sess_options.enable_mem_pattern = True
637
 
638
- # Use all cores
639
- sess_options.intra_op_num_threads = torch.get_num_threads()
640
-
641
- provider = 'CPUExecutionProvider'
642
- compiled_model = ort.InferenceSession(onnx_path, sess_options, providers=[provider])
643
-
644
- print("✓ ONNX Runtime compilation complete")
645
- return compiled_model
646
  except Exception as e:
647
- print(f"⚠ ONNX Runtime failed: {e}, using torch.compile")
648
- USE_ONNX_RUNTIME = False
649
 
650
- # Option 2: Intel IPEX
651
  if USE_IPEX:
652
  try:
653
  import intel_extension_for_pytorch as ipex
654
  model = ipex.optimize(model, dtype=torch.bfloat16, level="O3")
655
- print("✓ Intel IPEX O3 optimization applied")
656
  return model
657
  except Exception as e:
658
  print(f"⚠ IPEX failed: {e}")
659
- USE_IPEX = False
660
 
661
- # Option 3: torch.compile (PyTorch 2.0+)
662
  if USE_TORCH_COMPILE and hasattr(torch, 'compile'):
663
  try:
664
- # Use "max-autotune" for best performance
665
- model = torch.compile(
666
- model,
667
- mode="max-autotune",
668
- fullgraph=True,
669
- backend="inductor"
670
- )
671
- print("✓ torch.compile (max-autotune) applied")
672
  except Exception as e:
673
- print(f"⚠ torch.compile failed: {e}, using eager mode")
674
 
675
  return model
676
 
677
  def warmup_model(model, tokenizer, chat_func):
678
- """Warmup the model for consistent performance."""
679
  if WARMUP_ITERATIONS == 0:
680
  return
681
 
682
  print("\nWarming up model...")
683
- start_time = time.time()
684
 
685
  try:
686
  with torch.inference_mode():
687
  for i in range(WARMUP_ITERATIONS):
688
- _ = chat_func(
689
- model, tokenizer, "Hello world",
690
- steps=8, block_size=32, max_new_tokens=16,
691
- temperature=0.8, top_k=50, top_p=0.9,
692
  repetition_penalty=1.2, no_repeat_ngram_size=3,
693
- verbose=False, visualize_fn=None, parallel_blocks=2
694
  )
695
- print(f" Warmup {i+1}/{WARMUP_ITERATIONS} complete")
696
  except Exception as e:
697
  print(f"⚠ Warmup failed: {e}")
698
 
699
- print(f"✓ Warmup finished in {time.time() - start_time:.2f}s")
700
 
701
  def load_model_internal():
702
- """Load the model with ultra-fast optimizations."""
703
- global model, tokenizer, config, device, DiffusionLLM, chat_function, optimized_pipeline
704
 
705
  if model is not None:
706
  return True
707
 
708
  try:
709
- print("=" * 60)
710
  print("ULTRA-FAST CPU MODEL LOADING")
711
- print("=" * 60)
712
 
713
- # Configure CPU
714
- configure_cpu_optimization()
715
-
716
- # Import modules
717
  base_path = find_file("infer-base.py")
718
  if base_path is None:
719
  raise RuntimeError("Could not find infer-base.py")
720
 
721
  base_mod = try_import_module(base_path, "infer_base")
722
- if base_mod is None or not hasattr(base_mod, 'DiffusionLLM'):
723
  raise RuntimeError("DiffusionLLM class not found")
724
 
725
  DiffusionLLM = base_mod.DiffusionLLM
 
726
 
 
727
  chat_path = find_file("infer-chat.py")
728
  if chat_path is None:
729
  raise RuntimeError("Could not find infer-chat.py")
@@ -733,12 +714,13 @@ def load_model_internal():
733
  raise RuntimeError("Chat function not found")
734
 
735
  chat_function = chat_mod.chat
 
736
 
737
  # Device
738
  device = torch.device("cpu")
739
 
740
  # Load tokenizer
741
- print("\nLoading tokenizer...")
742
  tokenizer = AutoTokenizer.from_pretrained(
743
  "Qwen/Qwen2.5-0.5B",
744
  use_fast=True,
@@ -746,7 +728,7 @@ def load_model_internal():
746
  )
747
  if tokenizer.pad_token is None:
748
  tokenizer.pad_token = tokenizer.eos_token
749
- print("✓ Fast tokenizer loaded")
750
 
751
  # Find model
752
  checkpoint_dirs = ["checkpoints", "../checkpoints", "./checkpoints"]
@@ -763,26 +745,27 @@ def load_model_internal():
763
  if model_path is None:
764
  raise RuntimeError("Model checkpoint not found")
765
 
766
- print(f"\nLoading model from: {model_path}")
767
 
768
- # Load checkpoint
769
  checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
770
  config = checkpoint['config']
771
 
772
- # Create and load model
 
773
  model = DiffusionLLM(config)
774
- state_dict = checkpoint['model_state']
775
 
776
- # Convert to float32 for CPU
777
- if not USE_IPEX: # Keep FP32 for ONNX, use BF16 for IPEX
 
778
  state_dict = {k: v.float() for k, v in state_dict.items()}
779
 
780
  model.load_state_dict(state_dict)
781
  model.eval()
 
782
 
783
  # Apply optimizations
784
  model = quantize_model(model)
785
- model = model.to(device)
786
  model = compile_model(model)
787
 
788
  # Warmup
@@ -790,24 +773,28 @@ def load_model_internal():
790
 
791
  # Print summary
792
  num_params = sum(p.numel() for p in model.parameters()) / 1e6
793
- print(f"\n{'=' * 60}")
794
- print(f"✓✓✓ MODEL LOADED & ULTRA-OPTIMIZED FOR CPU ✓✓✓")
795
- print(f"{'=' * 60}")
796
- print(f"Framework: {'ONNX Runtime' if USE_ONNX_RUNTIME else 'IPEX' if USE_IPEX else 'PyTorch'}")
 
 
797
  print(f"Parameters: {num_params:.1f}M")
798
- print(f"CPU Threads: {torch.get_num_threads()}")
799
- print(f"Quantization: {'INT8' if QUANTIZE_MODEL else 'BF16' if USE_IPEX else 'FP32'}")
800
  if 'step' in checkpoint:
801
  print(f"Training steps: {checkpoint['step']}")
802
- print(f"{'=' * 60}\n")
 
 
803
 
804
  return True
805
 
806
  except Exception as e:
807
- print(f"\nERROR LOADING MODEL: {e}")
808
  if __debug__:
809
  import traceback
810
  traceback.print_exc()
 
811
  return False
812
 
813
  def create_streaming_visualizer():
@@ -818,7 +805,6 @@ def create_streaming_visualizer():
818
  is_masked_list = [is_masked_list]
819
 
820
  try:
821
- # Decode only once for efficiency
822
  context_text = tok.decode(context_ids[0], skip_special_tokens=True).replace('\n', ' ')
823
  except Exception:
824
  context_text = str(context_ids[0].tolist())
@@ -828,10 +814,9 @@ def create_streaming_visualizer():
828
  block_tokens = mask_block[0].tolist()
829
  block_data = []
830
 
831
- # Batch decode for speed
832
  token_ids_to_decode = []
833
  positions = []
834
-
835
  for i, token_id in enumerate(block_tokens):
836
  if not is_masked[0, i]:
837
  token_ids_to_decode.append(token_id)
@@ -845,7 +830,7 @@ def create_streaming_visualizer():
845
  except Exception:
846
  decoded_tokens = [str(int(tid)) for tid in token_ids_to_decode]
847
 
848
- # Reconstruct block
849
  decoded_idx = 0
850
  for i, token_id in enumerate(block_tokens):
851
  if is_masked[0, i]:
@@ -875,11 +860,11 @@ def index():
875
  @app.route('/api/load', methods=['POST'])
876
  def load_model_endpoint():
877
  """Load the model."""
 
 
878
  data = request.json or {}
879
  check_only = data.get('check_only', False)
880
 
881
- global model
882
-
883
  if check_only:
884
  return jsonify({
885
  'loaded': model is not None,
@@ -906,8 +891,8 @@ def load_model_endpoint():
906
 
907
  @app.route('/api/generate', methods=['POST'])
908
  def generate():
909
- """Generate response - optimized for minimal latency."""
910
- global model, tokenizer, device, chat_function
911
 
912
  if model is None:
913
  return jsonify({'error': 'Model not loaded'}), 400
@@ -917,38 +902,25 @@ def generate():
917
 
918
  data = request.json
919
  instruction = data.get('instruction', '')
920
- steps = data.get('steps', 16) # Further reduced for speed
921
  block_size = data.get('block_size', 32)
922
  max_new_tokens = data.get('max_new_tokens', 64)
923
- parallel_blocks = data.get('parallel_blocks', torch.get_num_threads())
924
 
925
  if not instruction:
926
  return jsonify({'error': 'No instruction provided'}), 400
927
 
928
  try:
929
- # Fast path: no overhead
930
  with torch.inference_mode():
931
  raw_output, response = chat_function(
932
- model,
933
- tokenizer,
934
- instruction,
935
- steps=steps,
936
- block_size=block_size,
937
- max_new_tokens=max_new_tokens,
938
- temperature=0.7, # Slightly lower for faster sampling
939
- top_k=50,
940
- top_p=0.9,
941
- repetition_penalty=1.2,
942
- no_repeat_ngram_size=3,
943
- verbose=False,
944
- visualize_fn=None,
945
- parallel_blocks=parallel_blocks,
946
  )
947
 
948
- return jsonify({
949
- 'response': response,
950
- 'raw_output': raw_output
951
- })
952
 
953
  except Exception as e:
954
  if __debug__:
@@ -958,36 +930,33 @@ def generate():
958
 
959
  @app.route('/api/generate-stream', methods=['POST'])
960
  def generate_stream():
961
- """Generate with streaming - optimized with queue."""
962
  global model, tokenizer, chat_function
963
 
964
  if model is None:
965
  return jsonify({'error': 'Model not loaded'}), 400
966
 
967
- if chat_function is None:
968
- return jsonify({'error': 'Chat function not available'}), 400
969
-
970
  data = request.json
971
  instruction = data.get('instruction', '')
972
  steps = data.get('steps', 16)
973
  block_size = data.get('block_size', 32)
974
  max_new_tokens = data.get('max_new_tokens', 64)
975
- parallel_blocks = data.get('parallel_blocks', torch.get_num_threads())
976
 
977
  if not instruction:
978
  return jsonify({'error': 'No instruction provided'}), 400
979
 
980
  def generate_events():
981
- event_queue = queue.Queue(maxsize=100) # Limit queue size
982
  generation_complete = {'done': False, 'result': None}
983
 
984
  def streaming_visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear=True):
985
  try:
986
  visualizer = create_streaming_visualizer()
987
  data = visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear)
988
- event_queue.put({'type': 'update', 'data': data}, block=False)
989
  except queue.Full:
990
- pass # Drop updates if queue full (prevents memory issues)
991
 
992
  def run_generation():
993
  try:
@@ -1007,13 +976,13 @@ def generate_stream():
1007
  generation_complete['done'] = True
1008
  event_queue.put(None)
1009
 
1010
- import threading
1011
- gen_thread = threading.Thread(target=run_generation)
1012
- gen_thread.daemon = True
1013
  gen_thread.start()
1014
 
1015
- yield f"data: {json.dumps({'type': 'start'})}\n\n"
1016
 
 
1017
  while not generation_complete['done'] or not event_queue.empty():
1018
  try:
1019
  event = event_queue.get(timeout=0.1)
@@ -1025,12 +994,11 @@ def generate_stream():
1025
 
1026
  gen_thread.join(timeout=2.0)
1027
 
 
1028
  if generation_complete['result']:
1029
  raw_output, response = generation_complete['result']
1030
- if raw_output == 'error':
1031
- yield f"data: {json.dumps({'type': 'error', 'error': response})}\n\n"
1032
- else:
1033
- yield f"data: {json.dumps({'type': 'complete', 'response': response})}\n\n"
1034
 
1035
  return Response(
1036
  stream_with_context(generate_events()),
@@ -1044,35 +1012,28 @@ def generate_stream():
1044
 
1045
  @app.route('/api/status', methods=['GET'])
1046
  def status():
1047
- """Get system status."""
1048
  return jsonify({
1049
  'model_loaded': model is not None,
 
 
1050
  'torch_threads': torch.get_num_threads(),
1051
  'interop_threads': torch.get_num_interop_threads(),
1052
- 'cpu_count': os.cpu_count(),
1053
  'optimizations': {
1054
  'onnx_runtime': USE_ONNX_RUNTIME,
1055
  'ipex': USE_IPEX,
1056
  'torch_compile': USE_TORCH_COMPILE,
1057
- 'quantization': QUANTIZE_MODEL
 
1058
  }
1059
  })
1060
 
1061
  if __name__ == '__main__':
1062
  print("\n" + "=" * 70)
1063
- print("ULTRA-FAST CPU INFERENCE SERVER")
1064
  print("=" * 70)
1065
- print("Available optimizations:")
1066
- print(" ONNX Runtime (best):", USE_ONNX_RUNTIME)
1067
- print(" ✓ Intel IPEX:", USE_IPEX)
1068
- print(" ✓ torch.compile:", USE_TORCH_COMPILE)
1069
- print(" ✓ Quantization:", QUANTIZE_MODEL)
1070
- print(" ✓ Multi-threading")
1071
- print(" ✓ Inference mode")
1072
- print(" ✓ Fast tokenizer")
1073
- print(" ✓ Memory layout optimization")
1074
- print("\nTo install ONNX Runtime: pip install onnxruntime")
1075
- print("To install Intel IPEX: pip install intel-extension-for-pytorch")
1076
  print("=" * 70 + "\n")
1077
 
1078
  app.run(debug=False, host='0.0.0.0', port=7860, threaded=True)
 
487
  from transformers import AutoTokenizer
488
  import threading
489
  import queue
490
+ import warnings
491
+
492
+ # ============ CRITICAL: CONFIGURE THREADS BEFORE TORCH OPERATIONS ============
493
+ # Must be set IMMEDIATELY at module import time
494
+ def setup_cpu_threads():
495
+ """Configure CPU threads BEFORE any PyTorch parallel work starts."""
496
+ cpu_count = os.cpu_count() or 1
497
+ physical_cores = cpu_count // 2 if cpu_count > 1 else 1
498
+
499
+ # Set environment variables FIRST
500
+ os.environ["OMP_NUM_THREADS"] = str(physical_cores)
501
+ os.environ["MKL_NUM_THREADS"] = str(physical_cores)
502
+ os.environ["NUMEXPR_NUM_THREADS"] = str(physical_cores)
503
+
504
+ # Set PyTorch threads BEFORE any operations
505
+ try:
506
+ torch.set_num_threads(physical_cores)
507
+ torch.set_num_interop_threads(physical_cores)
508
+ except RuntimeError as e:
509
+ warnings.warn(f"Could not set threads: {e} (already initialized)")
510
+
511
+ print(f"✓ CPU threads configured: {physical_cores} physical cores")
512
+ return physical_cores
513
+
514
+ # Call immediately
515
+ PHYSICAL_CORES = setup_cpu_threads()
516
+ # ============================================================================
517
+
518
+ # Configuration flags
519
+ USE_ONNX_RUNTIME = False
520
+ USE_IPEX = False
521
+ USE_TORCH_COMPILE = True
522
+ QUANTIZE_MODEL = True
523
+ WARMUP_ITERATIONS = 3
524
 
525
  app = Flask(__name__, static_folder='static', static_url_path='/static')
526
+ CORS(app, resources={r"/api/*": {"origins": "*"}}) # More permissive for testing
527
 
528
  # Global state
529
  model = None
 
532
  device = None
533
  DiffusionLLM = None
534
  chat_function = None
535
+ ModelConfig = None # Will be imported from infer-base.py
536
 
537
  def find_file(filename, search_dirs=None):
538
  """Find a file in current directory or parent directories."""
 
557
  module_dir = os.path.dirname(filepath)
558
  if module_dir not in sys.path:
559
  sys.path.insert(0, module_dir)
560
+
561
  spec = importlib.util.spec_from_file_location(module_name, filepath)
562
  if spec is None:
563
  print(f"Could not create spec for {filepath}")
564
  return None
565
+
566
  module = importlib.util.module_from_spec(spec)
567
  sys.modules[module_name] = module
568
  spec.loader.exec_module(module)
569
+
570
+ print(f"✓ Successfully imported {module_name}")
571
  return module
572
  except Exception as e:
573
+ print(f"Error importing {filepath}: {e}")
574
  if __debug__:
575
  import traceback
576
  traceback.print_exc()
577
  return None
578
 
579
  def quantize_model(model):
580
  """Apply quantization for faster inference."""
581
  if not QUANTIZE_MODEL:
 
583
 
584
  print("\nApplying quantization...")
585
  try:
586
+ # Dynamic quantization - no calibration needed, works on any model
587
+ model = torch.quantization.quantize_dynamic(
588
+ model,
589
+ {torch.nn.Linear, torch.nn.Conv1d, torch.nn.Embedding},
590
+ dtype=torch.qint8
591
+ )
592
+ print("✓ INT8 dynamic quantization applied")
593
  return model
594
  except Exception as e:
595
  print(f"⚠ Quantization failed: {e}")
 
597
 
598
  def compile_model(model):
599
  """Compile model for maximum speed."""
 
 
600
  print("\nCompiling model...")
601
 
602
+ # ONNX Runtime (BEST performance)
603
  if USE_ONNX_RUNTIME:
604
  try:
605
  import onnxruntime as ort
 
606
  onnx_path = "model_optimized.onnx"
607
+
608
+ # Export if not exists
609
  if not os.path.exists(onnx_path):
610
+ print("Exporting model to ONNX format...")
611
+ dummy_input = torch.randint(0, 100, (1, 64))
612
  torch.onnx.export(
613
+ model, dummy_input, onnx_path,
 
 
614
  input_names=['input_ids'],
615
  output_names=['logits'],
616
  dynamic_axes={'input_ids': {0: 'batch', 1: 'sequence'}},
617
  opset_version=16
618
  )
619
 
620
+ # Create optimized session
621
  sess_options = ort.SessionOptions()
622
  sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
623
+ sess_options.intra_op_num_threads = PHYSICAL_CORES
 
624
 
625
+ return ort.InferenceSession(onnx_path, sess_options)
626
  except Exception as e:
627
+ print(f"⚠ ONNX Runtime failed: {e}")
 
628
 
629
+ # Intel IPEX
630
  if USE_IPEX:
631
  try:
632
  import intel_extension_for_pytorch as ipex
633
  model = ipex.optimize(model, dtype=torch.bfloat16, level="O3")
634
+ print("✓ Intel IPEX optimization applied")
635
  return model
636
  except Exception as e:
637
  print(f"⚠ IPEX failed: {e}")
 
638
 
639
+ # torch.compile
640
  if USE_TORCH_COMPILE and hasattr(torch, 'compile'):
641
  try:
642
+ model = torch.compile(model, mode="max-autotune")
643
+ print("✓ torch.compile applied")
644
  except Exception as e:
645
+ print(f"⚠ torch.compile failed: {e}")
646
 
647
  return model
648
 
649
  def warmup_model(model, tokenizer, chat_func):
650
+ """Warmup the model."""
651
  if WARMUP_ITERATIONS == 0:
652
  return
653
 
654
  print("\nWarming up model...")
655
+ start = time.time()
656
 
657
  try:
658
  with torch.inference_mode():
659
  for i in range(WARMUP_ITERATIONS):
660
+ chat_func(
661
+ model, tokenizer, "Hello",
662
+ steps=4, block_size=16, max_new_tokens=8,
663
+ temperature=0.7, top_k=50, top_p=0.9,
664
  repetition_penalty=1.2, no_repeat_ngram_size=3,
665
+ verbose=False, visualize_fn=None, parallel_blocks=PHYSICAL_CORES
666
  )
667
+ print(f" Warmup {i+1}/{WARMUP_ITERATIONS}...")
668
  except Exception as e:
669
  print(f"⚠ Warmup failed: {e}")
670
 
671
+ print(f"✓ Warmup complete ({time.time() - start:.2f}s)")
672
 
673
  def load_model_internal():
674
+ """Load model with ultra-fast optimizations."""
675
+ global model, tokenizer, config, device, DiffusionLLM, chat_function, ModelConfig
676
 
677
  if model is not None:
678
  return True
679
 
680
  try:
681
+ print("\n" + "=" * 70)
682
  print("ULTRA-FAST CPU MODEL LOADING")
683
+ print("=" * 70)
684
 
685
+ # FIRST: Import modules to get ModelConfig
686
+ print("\n1. Loading modules...")
 
 
687
  base_path = find_file("infer-base.py")
688
  if base_path is None:
689
  raise RuntimeError("Could not find infer-base.py")
690
 
691
  base_mod = try_import_module(base_path, "infer_base")
692
+ if base_mod is None:
693
+ raise RuntimeError("Failed to import infer-base.py")
694
+
695
+ # CRITICAL: Register ModelConfig for pickle
696
+ if hasattr(base_mod, 'ModelConfig'):
697
+ ModelConfig = base_mod.ModelConfig
698
+ sys.modules['__main__'].ModelConfig = ModelConfig
699
+ print("✓ ModelConfig registered for pickle")
700
+
701
+ if not hasattr(base_mod, 'DiffusionLLM'):
702
  raise RuntimeError("DiffusionLLM class not found")
703
 
704
  DiffusionLLM = base_mod.DiffusionLLM
705
+ print("✓ DiffusionLLM loaded")
706
 
707
+ # Import chat function
708
  chat_path = find_file("infer-chat.py")
709
  if chat_path is None:
710
  raise RuntimeError("Could not find infer-chat.py")
 
714
  raise RuntimeError("Chat function not found")
715
 
716
  chat_function = chat_mod.chat
717
+ print("✓ Chat function loaded")
718
 
719
  # Device
720
  device = torch.device("cpu")
721
 
722
  # Load tokenizer
723
+ print("\n2. Loading tokenizer...")
724
  tokenizer = AutoTokenizer.from_pretrained(
725
  "Qwen/Qwen2.5-0.5B",
726
  use_fast=True,
 
728
  )
729
  if tokenizer.pad_token is None:
730
  tokenizer.pad_token = tokenizer.eos_token
731
+ print("✓ Fast tokenizer ready")
732
 
733
  # Find model
734
  checkpoint_dirs = ["checkpoints", "../checkpoints", "./checkpoints"]
 
745
  if model_path is None:
746
  raise RuntimeError("Model checkpoint not found")
747
 
748
+ print(f"\n3. Loading checkpoint: {model_path}")
749
 
750
+ # CRITICAL: Load checkpoint AFTER ModelConfig is registered
751
  checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
752
  config = checkpoint['config']
753
 
754
+ # Create model
755
+ print("4. Building model...")
756
  model = DiffusionLLM(config)
 
757
 
758
+ # Load weights
759
+ state_dict = checkpoint['model_state']
760
+ if not USE_IPEX:
761
  state_dict = {k: v.float() for k, v in state_dict.items()}
762
 
763
  model.load_state_dict(state_dict)
764
  model.eval()
765
+ model = model.to(device)
766
 
767
  # Apply optimizations
768
  model = quantize_model(model)
 
769
  model = compile_model(model)
770
 
771
  # Warmup
 
773
 
774
  # Print summary
775
  num_params = sum(p.numel() for p in model.parameters()) / 1e6
776
+ framework = "ONNX Runtime" if USE_ONNX_RUNTIME else "IPEX" if USE_IPEX else "PyTorch"
777
+ precision = "INT8" if QUANTIZE_MODEL and not USE_IPEX else "BF16" if USE_IPEX else "FP32"
778
+
779
+ print("\n" + "=" * 70)
780
+ print(f"✓✓✓ MODEL LOADED & ULTRA-OPTIMIZED ({framework} + {precision}) ✓✓✓")
781
+ print("=" * 70)
782
  print(f"Parameters: {num_params:.1f}M")
783
+ print(f"CPU Threads: {PHYSICAL_CORES}")
 
784
  if 'step' in checkpoint:
785
  print(f"Training steps: {checkpoint['step']}")
786
+ if 'best_val_loss' in checkpoint:
787
+ print(f"Best val loss: {checkpoint['best_val_loss']:.4f}")
788
+ print("=" * 70 + "\n")
789
 
790
  return True
791
 
792
  except Exception as e:
793
+ print(f"\n✗ ERROR LOADING MODEL: {e}")
794
  if __debug__:
795
  import traceback
796
  traceback.print_exc()
797
+ print("=" * 70 + "\n")
798
  return False
799
 
800
  def create_streaming_visualizer():
 
805
  is_masked_list = [is_masked_list]
806
 
807
  try:
 
808
  context_text = tok.decode(context_ids[0], skip_special_tokens=True).replace('\n', ' ')
809
  except Exception:
810
  context_text = str(context_ids[0].tolist())
 
814
  block_tokens = mask_block[0].tolist()
815
  block_data = []
816
 
817
+ # Efficient batch decoding
818
  token_ids_to_decode = []
819
  positions = []
 
820
  for i, token_id in enumerate(block_tokens):
821
  if not is_masked[0, i]:
822
  token_ids_to_decode.append(token_id)
 
830
  except Exception:
831
  decoded_tokens = [str(int(tid)) for tid in token_ids_to_decode]
832
 
833
+ # Reconstruct
834
  decoded_idx = 0
835
  for i, token_id in enumerate(block_tokens):
836
  if is_masked[0, i]:
 
860
  @app.route('/api/load', methods=['POST'])
861
  def load_model_endpoint():
862
  """Load the model."""
863
+ global model
864
+
865
  data = request.json or {}
866
  check_only = data.get('check_only', False)
867
 
 
 
868
  if check_only:
869
  return jsonify({
870
  'loaded': model is not None,
 
891
 
892
  @app.route('/api/generate', methods=['POST'])
893
  def generate():
894
+ """Generate response - ultra-fast path."""
895
+ global model, tokenizer, chat_function
896
 
897
  if model is None:
898
  return jsonify({'error': 'Model not loaded'}), 400
 
902
 
903
  data = request.json
904
  instruction = data.get('instruction', '')
905
+ steps = data.get('steps', 16) # Minimal for speed
906
  block_size = data.get('block_size', 32)
907
  max_new_tokens = data.get('max_new_tokens', 64)
908
+ parallel_blocks = data.get('parallel_blocks', PHYSICAL_CORES)
909
 
910
  if not instruction:
911
  return jsonify({'error': 'No instruction provided'}), 400
912
 
913
  try:
 
914
  with torch.inference_mode():
915
  raw_output, response = chat_function(
916
+ model, tokenizer, instruction,
917
+ steps=steps, block_size=block_size, max_new_tokens=max_new_tokens,
918
+ temperature=0.7, top_k=50, top_p=0.9,
919
+ repetition_penalty=1.2, no_repeat_ngram_size=3,
920
+ verbose=False, visualize_fn=None, parallel_blocks=parallel_blocks,
921
  )
922
 
923
+ return jsonify({'response': response, 'raw_output': raw_output})
924
 
925
  except Exception as e:
926
  if __debug__:
 
930
 
931
  @app.route('/api/generate-stream', methods=['POST'])
932
  def generate_stream():
933
+ """Generate with streaming - optimized."""
934
  global model, tokenizer, chat_function
935
 
936
  if model is None:
937
  return jsonify({'error': 'Model not loaded'}), 400
938
 
 
 
 
939
  data = request.json
940
  instruction = data.get('instruction', '')
941
  steps = data.get('steps', 16)
942
  block_size = data.get('block_size', 32)
943
  max_new_tokens = data.get('max_new_tokens', 64)
944
+ parallel_blocks = data.get('parallel_blocks', PHYSICAL_CORES)
945
 
946
  if not instruction:
947
  return jsonify({'error': 'No instruction provided'}), 400
948
 
949
  def generate_events():
950
+ event_queue = queue.Queue(maxsize=50) # Limited queue
951
  generation_complete = {'done': False, 'result': None}
952
 
953
  def streaming_visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear=True):
954
  try:
955
  visualizer = create_streaming_visualizer()
956
  data = visualizer(tok, context_ids, mask_blocks, is_masked_list, cfg, clear)
957
+ event_queue.put({'type': 'update', 'data': data}, block=False, timeout=0.1)
958
  except queue.Full:
959
+ pass # Drop frames if too slow
960
 
961
  def run_generation():
962
  try:
 
976
  generation_complete['done'] = True
977
  event_queue.put(None)
978
 
979
+ # Start generation thread
980
+ gen_thread = threading.Thread(target=run_generation, daemon=True)
 
981
  gen_thread.start()
982
 
983
+ yield f"data: {json.dumps({'type': 'start', 'ts': time.time()})}\n\n"
984
 
985
+ # Stream events
986
  while not generation_complete['done'] or not event_queue.empty():
987
  try:
988
  event = event_queue.get(timeout=0.1)
 
994
 
995
  gen_thread.join(timeout=2.0)
996
 
997
+ # Send final result
998
  if generation_complete['result']:
999
  raw_output, response = generation_complete['result']
1000
+ yield f"data: {json.dumps({'type': 'complete' if raw_output != 'error' else 'error',
1001
+ 'response': response, 'error': response if raw_output == 'error' else None})}\n\n"
 
 
1002
 
1003
  return Response(
1004
  stream_with_context(generate_events()),
 
1012
 
1013
  @app.route('/api/status', methods=['GET'])
1014
  def status():
1015
+ """Get detailed status."""
1016
  return jsonify({
1017
  'model_loaded': model is not None,
1018
+ 'cpu_cores': os.cpu_count(),
1019
+ 'physical_cores': PHYSICAL_CORES,
1020
  'torch_threads': torch.get_num_threads(),
1021
  'interop_threads': torch.get_num_interop_threads(),
 
1022
  'optimizations': {
1023
  'onnx_runtime': USE_ONNX_RUNTIME,
1024
  'ipex': USE_IPEX,
1025
  'torch_compile': USE_TORCH_COMPILE,
1026
+ 'quantization': QUANTIZE_MODEL,
1027
+ 'warmup_iterations': WARMUP_ITERATIONS
1028
  }
1029
  })
1030
 
1031
  if __name__ == '__main__':
1032
  print("\n" + "=" * 70)
1033
+ print("ULTRA-FAST CPU INFERENCE SERVER v2.0")
1034
  print("=" * 70)
1035
+ print(f"CPU Configuration: {PHYSICAL_CORES} physical cores")
1036
+ print(f"Optimizations: ONNX={USE_ONNX_RUNTIME} | IPEX={USE_IPEX} | Compile={USE_TORCH_COMPILE}")
1037
  print("=" * 70 + "\n")
1038
 
1039
  app.run(debug=False, host='0.0.0.0', port=7860, threaded=True)
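For reference, a minimal client sketch against the endpoints this commit exposes (/api/load, /api/generate, /api/generate-stream, served on port 7860 per app.run above). The payload fields mirror the route handlers in the diff; the base URL, the prompt text, and the use of the requests library are assumptions for illustration, not part of the commit.

# Minimal client sketch for the Flask endpoints defined in app.py.
# Assumes the server above is running locally on port 7860.
import json
import requests

BASE = "http://localhost:7860"

# Ask the server to load the model (returns immediately if already loaded).
requests.post(f"{BASE}/api/load", json={})

# Blocking generation via /api/generate.
resp = requests.post(f"{BASE}/api/generate", json={
    "instruction": "Explain diffusion language models in one sentence.",
    "steps": 16,
    "block_size": 32,
    "max_new_tokens": 64,
})
print(resp.json().get("response"))

# Streaming generation via /api/generate-stream (server-sent events).
with requests.post(f"{BASE}/api/generate-stream",
                   json={"instruction": "Hello"}, stream=True) as stream:
    for line in stream.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            event = json.loads(line[len("data: "):])
            if event.get("type") in ("complete", "error"):
                print(event.get("response") or event.get("error"))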