ningpp
/

GLM-OCR

 Flux is a Java-based OCR
+## Attention
+**If you downloaded the model before 2026-03-07, please download it again: the current version of the model has better inference performance.**
+## ONNX Inference
+```python
+"""
+End-to-end ONNX inference for GLM-OCR model.
+This script performs complete inference using exported ONNX models:
+1. Vision encoder (processes images)
+2. Embedding layer (converts token IDs to embeddings)
+3. Prefill model (processes prompt)
+4. Decode model (generates tokens autoregressively)
+Usage:
+    python onnx_inference_e2e.py --image <path> --max-tokens 100
+    python onnx_inference_e2e.py --use-real-images --max-tokens 100
+"""
+import os
+import sys
+import time
+import argparse
+from typing import List, Tuple, Optional
+from PIL import Image
+import numpy as np
+import onnxruntime as ort
+from transformers import AutoProcessor, AutoModelForImageTextToText, AutoConfig
+class GLMOcrOnnxInference:
+    """End-to-end ONNX inference for GLM-OCR."""
+    def __init__(self, onnx_dir: str, device: str = "cpu"):
+        """
+        Initialize ONNX inference sessions.
+        Args:
+            onnx_dir: Directory containing exported ONNX models
+            device: "cpu" or "cuda"
+        """
+        self.onnx_dir = onnx_dir
+        self.device = device
+        self.providers = ["CUDAExecutionProvider"] if device == "cuda" else ["CPUExecutionProvider"]
+        # Load processor for tokenization
+        print(f"Loading processor from {onnx_dir}...")
+        self.processor = AutoProcessor.from_pretrained(onnx_dir, trust_remote_code=True)
+        # Model config
+        self.config = self._load_config()
+        # Create ONNX sessions
+        self.sessions = self._create_sessions()
+    def _load_config(self):
+        """Load model configuration without loading the entire model."""
+        # Load config directly instead of the entire model
+        config = AutoConfig.from_pretrained(self.onnx_dir, trust_remote_code=True)
+        return config
+    def _create_sessions(self) -> dict:
+        """Create ONNX Runtime sessions for all models."""
+        print("Creating ONNX Runtime sessions...")
+        opts = ort.SessionOptions()
+        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        if self.device == "cuda":
+            # CUDA-specific optimizations
+            opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+            opts.enable_mem_pattern = True
+            opts.enable_mem_reuse = True
+        else:
+            # CPU optimizations
+            opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+            import multiprocessing
+            num_cores = multiprocessing.cpu_count()
+            opts.intra_op_num_threads = num_cores
+            opts.inter_op_num_threads = 1
+        sessions = {}
+        # Get available providers and set up CUDA options
+        if self.device == "cuda":
+            available_providers = ort.get_available_providers()
+            providers = []
+            # Try TensorRT first if available (best performance)
+            if "TensorrtExecutionProvider" in available_providers:
+                print("  TensorRT is available but disabled temporarily due to shape inference requirements")
+                # Commented out until we run shape inference on the model
+                # providers.append(("TensorrtExecutionProvider", {
+                #     "trt_engine_cache_enable": True,
+                #     "trt_engine_cache_path": "./trt_cache",
+                #     "trt_fp16_enable": True,
+                # }))
+                # print("  Using TensorRT Execution Provider")
+            # Always add CUDAExecutionProvider
+            providers.append(("CUDAExecutionProvider", {
+                "device_id": 0,
+                "arena_extend_strategy": "kNextPowerOfTwo",
+                "cudnn_conv_algo_search": "EXHAUSTIVE",
+                "do_copy_in_default_stream": True,
+            }))
+            # Fallback to CPU
+            providers.append("CPUExecutionProvider")
+        else:
+            providers = self.providers
+        # Vision encoder
+        vision_path = os.path.join(self.onnx_dir, "vision_encoder_fused.onnx")
+        if os.path.exists(vision_path):
+            sessions["vision"] = ort.InferenceSession(
+                vision_path, opts, providers=providers
+            )
+            print(f"  ✓ Vision encoder loaded")
+        # Embedding layer
+        embedding_path = os.path.join(self.onnx_dir, "embedding.onnx")
+        if os.path.exists(embedding_path):
+            sessions["embedding"] = ort.InferenceSession(
+                embedding_path, opts, providers=providers
+            )
+            print(f"  ✓ Embedding layer loaded")
+        # Prefill model
+        prefill_path = os.path.join(self.onnx_dir, "llm_prefill.onnx")
+        if os.path.exists(prefill_path):
+            sessions["prefill"] = ort.InferenceSession(
+                prefill_path, opts, providers=providers
+            )
+            print(f"  ✓ Prefill model loaded")
+        # Decode model
+        decode_path = os.path.join(self.onnx_dir, "llm_decode.onnx")
+        if os.path.exists(decode_path):
+            sessions["decode"] = ort.InferenceSession(
+                decode_path, opts, providers=providers
+            )
+            print(f"  ✓ Decode model loaded")
+        return sessions
+    def encode_image(self, image_path: str) -> np.ndarray:
+        """
+        Encode image using vision encoder.
+        Args:
+            image_path: Path to image file
+        Returns:
+            Image features as numpy array
+        """
+        if "vision" not in self.sessions:
+            raise RuntimeError("Vision encoder not available")
+        # Load and preprocess image
+        image = Image.open(image_path).convert("RGB")
+        # Use full processor to get all necessary inputs (pixel_values, grid_thw)
+        messages = [{'role': 'user', 'content': [{'type': 'image'}, {'type': 'text', 'text': 'test'}]}]
+        text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = self.processor(text=text, images=[image], return_tensors='pt')
+        pixel_values = inputs.pixel_values
+        grid_thw = inputs.image_grid_thw
+        # Compute pos_ids and max_grid_size
+        pos_ids, max_grid_size = self._compute_pos_ids(grid_thw)
+        # Convert to numpy arrays
+        pixel_values_np = pixel_values.numpy()
+        pos_ids_np = pos_ids.numpy()
+        max_grid_size_np = np.array(max_grid_size, dtype=np.int64)
+        # Run vision encoder
+        outputs = self.sessions["vision"].run(None, {
+            "pixel_values": pixel_values_np,
+            "pos_ids": pos_ids_np,
+            "max_grid_size": max_grid_size_np
+        })
+        return outputs[0]  # image_features
+    def _compute_pos_ids(self, grid_thw, spatial_merge_size: int = 2):
+        """
+        Pre-compute position IDs for rotary embeddings.
+        Args:
+            grid_thw: [batch_size, 3] - (temporal, height_patches, width_patches) for each image
+            spatial_merge_size: The spatial merge factor (default 2)
+        Returns:
+            pos_ids: [total_patches, 2] - position indices for all patches
+            max_grid_size: int - maximum grid dimension
+        """
+        import torch
+        pos_ids_list = []
+        for t, h, w in grid_thw:
+            t, h, w = int(t), int(h), int(w)
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // spatial_merge_size,
+                spatial_merge_size,
+                w // spatial_merge_size,
+                spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // spatial_merge_size,
+                spatial_merge_size,
+                w // spatial_merge_size,
+                spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()
+            pos_ids_list.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids_list, dim=0)
+        max_grid_size = int(grid_thw[:, 1:].max())
+        return pos_ids, max_grid_size
+    def _get_rope_index(self, input_ids_list, image_grid_thw, attention_mask_list=None):
+        """
+        Calculate position_ids for M-RoPE (same logic as PyTorch's get_rope_index).
+        Args:
+            input_ids_list: List of input token IDs
+            image_grid_thw: Tensor of [t, h, w] for image grid
+            attention_mask_list: List of attention mask values
+        Returns:
+            position_ids: numpy array of shape [3, seq_len]
+            rope_deltas: int, the delta for decode position calculation
+        """
+        import itertools
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        # Get image grid dimensions
+        t, h, w = image_grid_thw[0][0].item(), image_grid_thw[0][1].item(), image_grid_thw[0][2].item()
+        llm_grid_t = t
+        llm_grid_h = h // spatial_merge_size
+        llm_grid_w = w // spatial_merge_size
+        # Find image token positions
+        boi_token_id = 59256  #
+        eoi_token_id = 59257  #
+        # Build position_ids
+        seq_len = len(input_ids_list)
+        position_ids = np.zeros((3, seq_len), dtype=np.int64)
+        # Find BOI and EOI positions
+        boi_pos = None
+        eoi_pos = None
+        for i, tid in enumerate(input_ids_list):
+            if tid == boi_token_id:
+                boi_pos = i
+            elif tid == eoi_token_id:
+                eoi_pos = i
+        if boi_pos is None or eoi_pos is None:
+            # No image tokens, use simple position_ids
+            for i in range(seq_len):
+                position_ids[0, i] = i
+                position_ids[1, i] = i
+                position_ids[2, i] = i
+            return position_ids, 0
+        # Text tokens before image
+        for i in range(boi_pos):
+            position_ids[0, i] = i
+            position_ids[1, i] = i
+            position_ids[2, i] = i
+        # BOI token
+        st_idx = boi_pos
+        position_ids[0, boi_pos] = st_idx
+        position_ids[1, boi_pos] = st_idx
+        position_ids[2, boi_pos] = st_idx
+        # Image tokens - use 3D position encoding
+        # t_index, h_index, w_index for each image token
+        img_start = boi_pos + 1
+        img_end = eoi_pos
+        for idx, pos in enumerate(range(img_start, img_end)):
+            t_idx = idx // (llm_grid_h * llm_grid_w)
+            hw_idx = idx % (llm_grid_h * llm_grid_w)
+            h_idx = hw_idx // llm_grid_w
+            w_idx = hw_idx % llm_grid_w
+            position_ids[0, pos] = st_idx + t_idx
+            position_ids[1, pos] = st_idx + h_idx
+            position_ids[2, pos] = st_idx + w_idx
+        # EOI token and text after
+        max_img_pos = max(
+            position_ids[0, img_start:img_end].max(),
+            position_ids[1, img_start:img_end].max(),
+            position_ids[2, img_start:img_end].max()
+        )
+        for i, pos in enumerate(range(eoi_pos, seq_len)):
+            position_ids[0, pos] = max_img_pos + 1 + i
+            position_ids[1, pos] = max_img_pos + 1 + i
+            position_ids[2, pos] = max_img_pos + 1 + i
+        # Calculate rope_deltas
+        max_pos = max(
+            position_ids[0].max(),
+            position_ids[1].max(),
+            position_ids[2].max()
+        )
+        rope_deltas = max_pos + 1 - seq_len
+        return position_ids, rope_deltas
+    def _run_with_io_binding(self, session, inputs_dict, device="cuda"):
+        """
+        Run inference (IO Binding temporarily disabled to ensure correct outputs).
+        Args:
+            session: ONNX Runtime InferenceSession
+            inputs_dict: Dictionary of input name -> numpy array
+            device: "cuda" or "cpu"
+        Returns:
+            list of numpy arrays
+        """
+        # Disable IO Binding temporarily to avoid garbage outputs
+        return session.run(None, inputs_dict)
+    def generate(
+        self,
+        image_path: str,
+        prompt: str = "",
+        max_new_tokens: int = 100,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+    ) -> str:
+        """
+        Generate text from image.
+        Args:
+            image_path: Path to input image
+            prompt: Optional text prompt
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature
+            top_p: Top-p sampling parameter
+        Returns:
+            Generated text
+        """
+        print(f"\nGenerating for image: {image_path}")
+        print(f"  Prompt: '{prompt}'")
+        print(f"  Max tokens: {max_new_tokens}")
+        print(f"  Device: {self.device}")
+        # Step 1: Encode image
+        print("\n[1/4] Encoding image...")
+        start_time = time.time()
+        image_features = self.encode_image(image_path)
+        print(f"  Image features shape: {image_features.shape}")
+        print(f"  Time: {time.time() - start_time:.2f}s")
+        # Step 2: Prepare input
+        print("\n[2/4] Preparing input...")
+        start_time = time.time()
+        # Load image for processor
+        image = Image.open(image_path).convert("RGB")
+        # Create messages for GLM-OCR chat template (same as transformers_infer.py)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": image_path},
+                    {"type": "text", "text": prompt if prompt else "Describe this image."}
+                ]
+            }
+        ]
+        inputs = self.processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        )
+        inputs.pop("token_type_ids", None)
+        input_ids = inputs["input_ids"].numpy()
+        attention_mask = inputs["attention_mask"].numpy()
+        print(f"  Input IDs shape: {input_ids.shape}")
+        print(f"  Time: {time.time() - start_time:.2f}s")
+        # Step 3: Embedding
+        print("\n[3/4] Getting embeddings...")
+        start_time = time.time()
+        image_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|image|>")
+        input_ids_list = input_ids[0].tolist()
+        # Get embeddings
+        embed_outputs = self._run_with_io_binding(
+            self.sessions["embedding"],
+            {"input_ids": input_ids},
+            device=self.device
+        )
+        inputs_embeds = embed_outputs[0]
+        # Replace image token embeddings with actual image features
+        image_positions = [i for i, tid in enumerate(input_ids_list) if tid == image_token_id]
+        if len(image_positions) > 0:
+            num_image_tokens = image_features.shape[0]
+            if len(image_positions) == num_image_tokens:
+                for i, pos in enumerate(image_positions):
+                    inputs_embeds[0, pos] = image_features[i]
+                print(f"  Replaced {num_image_tokens} image tokens")
+            else:
+                # Remove original <|image|> tokens from input_ids and get embeddings
+                non_image_mask = np.array([tid != image_token_id for tid in input_ids_list])
+                inputs_embeds = inputs_embeds[:, non_image_mask, :]
+                # Also update attention_mask to remove original image token
+                attention_mask = attention_mask[:, non_image_mask]
+                boi_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|begin_of_image|>")
+                if boi_token_id in input_ids_list:
+                    boi_pos = input_ids_list.index(boi_token_id)
+                    before = inputs_embeds[:, :boi_pos+1, :]
+                    after = inputs_embeds[:, boi_pos+1:, :]
+                    image_features_batch = image_features[np.newaxis, :, :]
+                    inputs_embeds = np.concatenate([before, image_features_batch, after], axis=1)
+                    before_mask = attention_mask[:, :boi_pos+1]
+                    image_mask = np.ones((1, num_image_tokens), dtype=np.int64)
+                    after_mask = attention_mask[:, boi_pos+1:]
+                    attention_mask = np.concatenate([before_mask, image_mask, after_mask], axis=1)
+                    print(f"  Inserted {num_image_tokens} image tokens")
+        print(f"  Embeddings shape: {inputs_embeds.shape}")
+        print(f"  Time: {time.time() - start_time:.2f}s")
+        # Step 4: Prefill
+        print("\n[4/4] Running inference...")
+        start_time = time.time()
+        seq_len = inputs_embeds.shape[1]
+        # M-RoPE: Calculate position_ids with proper 3D positions for image tokens
+        # We need to use the same logic as PyTorch's get_rope_index
+        image_grid_thw = inputs.get("image_grid_thw")
+        if image_grid_thw is not None:
+            # Calculate position_ids using the same logic as PyTorch
+            position_ids, rope_deltas = self._get_rope_index(
+                input_ids[0].tolist(),
+                image_grid_thw,
+                attention_mask[0].tolist()
+            )
+            position_ids = position_ids[:, np.newaxis, :]
+            print(f"  M-RoPE enabled: rope_deltas={rope_deltas}")
+        else:
+            # Fallback to simple position_ids
+            position_ids = np.arange(seq_len, dtype=np.int64)
+            position_ids = np.stack([position_ids, position_ids, position_ids], axis=0)
+            position_ids = position_ids[:, np.newaxis, :]
+            rope_deltas = 0
+        prefill_inputs = {
+            "inputs_embeds": inputs_embeds.astype(np.float32),
+            "attention_mask": attention_mask.astype(np.int64),
+            "position_ids": position_ids.astype(np.int64),
+        }
+        prefill_outputs = self._run_with_io_binding(
+            self.sessions["prefill"],
+            prefill_inputs,
+            device=self.device
+        )
+        logits = prefill_outputs[0]
+        past_key_values = prefill_outputs[1:]
+        print(f"  Prefill logits shape: {logits.shape}")
+        print(f"  KV cache tensors: {len(past_key_values)}")
+        print(f"  Time: {time.time() - start_time:.2f}s")
+        print(f"\n[5/5] Generating tokens...", flush=True)
+        print(f"  DEBUG: seq_len={seq_len}, prefill positions=[0..{seq_len-1}]")
+        generated_tokens = []
+        decode_attention_mask = attention_mask.copy()
+        for step in range(max_new_tokens):
+            next_token_logits = logits[:, -1, :]
+            next_token_id = int(np.argmax(next_token_logits, axis=-1)[0])
+            generated_tokens.append(next_token_id)
+            if step < 5:
+                print(f"  DEBUG step={step}: token={next_token_id} ('{self.processor.tokenizer.decode([next_token_id])}')")
+            if next_token_id in [self.processor.tokenizer.eos_token_id, 59253]:
+                print(f"  EOS token reached at step {step + 1}")
+                break
+            # Update attention mask BEFORE decode (to match PyTorch behavior)
+            decode_attention_mask = np.concatenate(
+                [decode_attention_mask, np.ones((1, 1), dtype=np.int64)], axis=1
+            )
+            # Get next token embedding
+            next_token_embeds = self._run_with_io_binding(
+                self.sessions["embedding"],
+                {"input_ids": np.array([[next_token_id]], dtype=np.int64)},
+                device=self.device
+            )[0]
+            # Position IDs for M-RoPE: position = cache_position + rope_deltas
+            # This ensures correct position encoding after image tokens
+            cache_position = seq_len + step
+            new_position = cache_position + rope_deltas
+            decode_position_ids = np.full((3, 1, 1), new_position, dtype=np.int64)
+            if step < 5:
+                print(f"  DEBUG step={step}: cache_pos={cache_position}, rope_delta={rope_deltas}, position_id={new_position}")
+            # Prepare decode inputs
+            decode_inputs = {
+                "inputs_embeds": next_token_embeds.astype(np.float32),
+                "attention_mask": decode_attention_mask,
+                "position_ids": decode_position_ids,
+            }
+            for layer_idx in range(16):
+                decode_inputs[f"past_key_{layer_idx}"] = past_key_values[layer_idx * 2]
+                decode_inputs[f"past_value_{layer_idx}"] = past_key_values[layer_idx * 2 + 1]
+            # Run decode
+            decode_outputs = self._run_with_io_binding(
+                self.sessions["decode"],
+                decode_inputs,
+                device=self.device
+            )
+            logits = decode_outputs[0]
+            past_key_values = decode_outputs[1:]
+            if (step + 1) % 10 == 0:
+                print(f"  Generated {step + 1} tokens...")
+        print(f"\n  Total tokens generated: {len(generated_tokens)}")
+        print(f"  Time: {time.time() - start_time:.2f}s")
+        # Save full token sequence (input + generated) to file for comparison
+        # Note: input_ids_list contains the original 237 tokens from processor
+        # The actual tokens fed to prefill model may differ due to image token handling
+        full_sequence = input_ids_list + generated_tokens
+        with open("result_token_ids_onnx.txt", "w", encoding="utf-8") as f:
+            f.write(f"ONNX Full Token IDs (including input)\n")
+            f.write(f"Total: {len(full_sequence)} tokens\n")
+            f.write(f"Input length: {len(input_ids_list)} tokens (from processor)\n")
+            f.write(f"Prefill seq_len: {seq_len} tokens (actual embeddings fed to model)\n")
+            f.write(f"Generated: {len(generated_tokens)} tokens\n")
+            f.write("="*80 + "\n\n")
+            f.write(f"Full sequence:\n")
+            f.write(f"{full_sequence}\n\n")
+            f.write(f"Input part (first {len(input_ids_list)}):\n")
+            f.write(f"{input_ids_list}\n\n")
+            f.write(f"Generated part (last {len(generated_tokens)}):\n")
+            f.write(f"{generated_tokens}\n")
+        print(f"  Full token IDs saved to result_token_ids_onnx.txt")
+        generated_text = self.processor.tokenizer.decode(
+            generated_tokens, skip_special_tokens=True
+        )
+        return generated_text
+    def _remove_duplicate_branches(self, text: str) -> str:
+        """
+        Remove duplicate branches from LaTeX formula output.
+        This fixes the issue where ONNX model generates repeated formula branches.
+        """
+        import re
+        # Split by line breaks (\\ in LaTeX)
+        lines = text.split('\\\\')
+        seen = set()
+        unique_lines = []
+        for line in lines:
+            # Normalize for comparison (remove extra spaces)
+            normalized = re.sub(r'\s+', ' ', line.strip())
+            if not normalized or normalized not in seen:
+                if normalized:
+                    seen.add(normalized)
+                unique_lines.append(line)
+        return '\\\\'.join(unique_lines)
+    def generate_batch(
+        self,
+        image_paths: List[str],
+        prompt: str = "",
+        max_new_tokens: int = 100,
+    ) -> List[str]:
+        """
+        Generate text for multiple images.
+        Args:
+            image_paths: List of image paths
+            prompt: Optional text prompt
+            max_new_tokens: Maximum number of tokens to generate
+        Returns:
+            List of generated texts
+        """
+        results = []
+        for image_path in image_paths:
+            text = self.generate(image_path, prompt, max_new_tokens)
+            results.append(text)
+        return results
+def main():
+    parser = argparse.ArgumentParser(description="GLM-OCR ONNX End-to-End Inference")
+    parser.add_argument(
+        "--onnx-dir",
+        type=str,
+        default=r"D:\models\onnx-v5\GLM-OCR",
+        help="ONNX models directory",
+    )
+    parser.add_argument(
+        "--image",
+        type=str,
+        default=None,
+        help="Single image path",
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="Formula Recognition:",
+        help="Text prompt",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=1024,
+        help="Maximum tokens to generate",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        choices=["cpu", "cuda"],
+        help="Device to use",
+    )
+    args = parser.parse_args()
+    # Get image paths
+    if args.image:
+        image_paths = [args.image]
+    else:
+        print("Error: --image must be specified")
+        sys.exit(1)
+    # Initialize inference
+    inference = GLMOcrOnnxInference(
+        onnx_dir=args.onnx_dir,
+        device=args.device,
+    )
+    # Generate
+    print("\n" + "=" * 60)
+    print("GLM-OCR ONNX End-to-End Inference")
+    print("=" * 60)
+    results = inference.generate_batch(
+        image_paths=image_paths,
+        prompt=args.prompt,
+        max_new_tokens=args.max_tokens,
+    )
+    # Print results
+    print("\n" + "=" * 60)
+    print("Results")
+    print("=" * 60)
+    for i, (image_path, text) in enumerate(zip(image_paths, results)):
+        print(f"\nImage {i + 1}: {image_path}")
+        print(f"Generated text:\n{text}")
+        print("-" * 60)
+if __name__ == "__main__":
+    main()
+```
 # GLM-OCR