Create model.py

model.py
ADDED
@@ -0,0 +1,333 @@
"""
Model loading and inference for OCR Confidence Visualization.

Loads one of several vision-language OCR models (Nanonets-OCR2-3B,
olmOCR-7B, or Aya-Vision-8B; see AVAILABLE_MODELS) and provides
inference with token-level probability extraction.
"""

import math
from dataclasses import dataclass, field
from typing import Generator, Optional

import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

# Available models for selection
AVAILABLE_MODELS = {
    "Nanonets-OCR2-3B": "nanonets/Nanonets-OCR2-3B",
    "olmOCR-7B": "allenai/olmOCR-7B-0725",
    "Aya-Vision-8B": "CohereLabs/aya-vision-8b",
}

DEFAULT_MODEL = "Aya-Vision-8B"

# Global model and processor (loaded once per model)
_model = None
_processor = None
_device = None
_current_model_name = None


@dataclass
class TokenData:
    """Data for a single generated token with probability info."""

    token: str
    probability: float
    alternatives: list[dict[str, float]]  # [{"token": str, "probability": float}, ...]
    entropy: float = field(default=0.0)  # Shannon entropy in bits
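
# Illustrative instance (made-up values, not model output): a confidently
# read token might look like
#   TokenData(token="Invoice", probability=0.92,
#             alternatives=[{"token": "INVOICE", "probability": 0.05}],
#             entropy=0.47)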


def calculate_entropy(probs: list[float]) -> float:
    """Calculate Shannon entropy in bits from a probability distribution.

    Args:
        probs: List of probabilities (should sum to ~1.0).

    Returns:
        Entropy in bits. 0.0 for empty or single-certainty distributions.
    """
    entropy = 0.0
    for p in probs:
        if p > 0:
            entropy -= p * math.log2(p)
    return entropy
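
# Worked examples: calculate_entropy([0.5, 0.5]) == 1.0 bit (maximal for two
# outcomes), calculate_entropy([1.0]) == 0.0, and a skewed split such as
# calculate_entropy([0.9, 0.1]) gives roughly 0.47 bits.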
+
def load_model(model_name: str = None):
|
| 59 |
+
"""Load the OCR model and processor. Reloads if model_name differs from current."""
|
| 60 |
+
global _model, _processor, _device, _current_model_name
|
| 61 |
+
|
| 62 |
+
if model_name is None:
|
| 63 |
+
model_name = DEFAULT_MODEL
|
| 64 |
+
|
| 65 |
+
model_id = AVAILABLE_MODELS.get(model_name, AVAILABLE_MODELS[DEFAULT_MODEL])
|
| 66 |
+
|
| 67 |
+
# Return cached model if already loaded
|
| 68 |
+
if _model is not None and _current_model_name == model_name:
|
| 69 |
+
return _model, _processor
|
| 70 |
+
|
| 71 |
+
# Unload previous model if switching
|
| 72 |
+
if _model is not None:
|
| 73 |
+
print(f"Unloading previous model: {_current_model_name}")
|
| 74 |
+
del _model
|
| 75 |
+
del _processor
|
| 76 |
+
_model = None
|
| 77 |
+
_processor = None
|
| 78 |
+
torch.cuda.empty_cache()
|
| 79 |
+
|
| 80 |
+
_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 81 |
+
print(f"Using device: {_device}")
|
| 82 |
+
print(f"Loading model: {model_id}...")
|
| 83 |
+
|
| 84 |
+
_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
| 85 |
+
_model = AutoModelForImageTextToText.from_pretrained(
|
| 86 |
+
model_id,
|
| 87 |
+
attn_implementation="flash_attention_2",
|
| 88 |
+
trust_remote_code=True,
|
| 89 |
+
torch_dtype=torch.float16,
|
| 90 |
+
).to(_device).eval()
|
| 91 |
+
|
| 92 |
+
_current_model_name = model_name
|
| 93 |
+
print("Model loaded successfully")
|
| 94 |
+
return _model, _processor
|
| 95 |
+
|
| 96 |
+
|
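
# Switching models mid-session is just another call, e.g. load_model("olmOCR-7B");
# the previous weights are dropped and the CUDA cache cleared before the new load.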


def run_ocr(image: Image.Image, prompt: Optional[str] = None) -> str:
    """
    Run OCR on an image and return extracted text.

    Args:
        image: PIL Image to process
        prompt: Optional custom prompt (default: natural reading extraction)

    Returns:
        Extracted text from the image
    """
    model, processor = load_model()

    if prompt is None:
        prompt = "Extract the text from the above document as if you were reading it naturally."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
    ).to(_device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=1.0,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.1,
        )

    # Slice off input tokens
    generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    return output_text
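
# One-shot usage sketch ("page.png" is a hypothetical path):
#   text = run_ocr(Image.open("page.png").convert("RGB"))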


def generate_with_logprobs(
    image: Image.Image,
    prompt: Optional[str] = None,
    max_new_tokens: int = 1024,
    top_k: int = 20,
    top_p: float = 0.9,
    temperature: float = 1.0,  # 1.0 keeps the model's unscaled distribution; decoding is argmax either way
    repetition_penalty: float = 1.1,
    model_name: Optional[str] = None,
) -> Generator[TokenData, None, None]:
    """
    Generate OCR text token-by-token with probability information.

    Yields TokenData for each generated token, enabling streaming display
    with confidence visualization.

    Decoding is greedy (argmax), so temperature and repetition_penalty only
    reshape the reported probability distribution; top_p is accepted for API
    symmetry with run_ocr but is currently unused.

    Args:
        image: PIL Image to process
        prompt: Optional custom prompt (default: natural reading extraction)
        max_new_tokens: Maximum tokens to generate
        top_k: Number of top alternatives to include
        top_p: Nucleus sampling parameter (currently unused)
        temperature: Scaling applied to logits before the softmax
        repetition_penalty: Penalty for repeating tokens (>1.0 reduces repetition)
        model_name: Which model to use (from AVAILABLE_MODELS keys)

    Yields:
        TokenData with token string, probability, and top-k alternatives
    """
    model, processor = load_model(model_name)

    if prompt is None:
        prompt = "Extract the text from the above document as if you were reading it naturally."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
    ).to(_device)

    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    # Get EOS token ID for stopping - check model config first, then tokenizer
    eos_token_id = model.config.eos_token_id
    if eos_token_id is None:
        eos_token_id = processor.tokenizer.eos_token_id
    if isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]
    elif eos_token_id is None:
        eos_token_id = []  # No EOS token - will rely on max_new_tokens

    # Track generated tokens
    generated_ids = input_ids.clone()

    # Extract image inputs (pixel_values, image_grid_thw for Qwen2.5-VL)
    model_inputs = {k: v for k, v in inputs.items() if k not in ("input_ids", "attention_mask")}

    # Use DynamicCache for proper KV cache management
    from transformers import DynamicCache
    past_key_values = DynamicCache()

    # Track sequence length for cache_position
    seq_length = input_ids.shape[1]

    # Track rope_deltas for multimodal RoPE (required for Qwen2.5-VL)
    # This is computed on the first forward pass and must be passed to subsequent passes
    rope_deltas = None

    with torch.no_grad():
        for step in range(max_new_tokens):
            # Forward pass
            if step == 0:
                # First step: include image data, full sequence
                cache_position = torch.arange(seq_length, device=_device)
                outputs = model(
                    input_ids=generated_ids,
                    attention_mask=attention_mask,
                    cache_position=cache_position,
                    past_key_values=past_key_values,
                    **model_inputs,
                    return_dict=True,
                    use_cache=True,
                )
            else:
                # Subsequent steps: only the newest token, with KV cache.
                # rope_deltas (captured on the first pass) gives correct multimodal
                # position encoding on Qwen2.5-VL-style models; other models never
                # produce it, so it is only passed when present.
                step_kwargs = {"rope_deltas": rope_deltas} if rope_deltas is not None else {}
                cache_position = torch.tensor([seq_length], device=_device)
                outputs = model(
                    input_ids=generated_ids[:, -1:],
                    attention_mask=attention_mask,
                    cache_position=cache_position,
                    past_key_values=past_key_values,
                    **step_kwargs,
                    return_dict=True,
                    use_cache=True,
                )

            past_key_values = outputs.past_key_values
            # Capture rope_deltas from first pass for multimodal position encoding
            if step == 0 and hasattr(outputs, "rope_deltas") and outputs.rope_deltas is not None:
                rope_deltas = outputs.rope_deltas

            # Get logits for last token position - convert to float32 to avoid overflow
            next_token_logits = outputs.logits[:, -1, :].float()

            # Apply repetition penalty to previously seen tokens
            # (once per unique token id, so repeats don't compound the penalty)
            if repetition_penalty != 1.0:
                for prev_token_id in set(generated_ids[0].tolist()):
                    if next_token_logits[0, prev_token_id] < 0:
                        next_token_logits[0, prev_token_id] *= repetition_penalty
                    else:
                        next_token_logits[0, prev_token_id] /= repetition_penalty

            # Apply temperature
            if temperature > 0:
                next_token_logits = next_token_logits / temperature

            # Compute probabilities via softmax
            probs = torch.softmax(next_token_logits, dim=-1)

            # Get top-k probabilities and indices
            top_probs, top_indices = torch.topk(probs, k=min(top_k, probs.shape[-1]))
            top_probs = top_probs[0].cpu().tolist()
            top_indices = top_indices[0].cpu().tolist()

            # Greedy decoding: pick the highest-probability token (argmax)
            next_token_id = top_indices[0]
            next_token_prob = top_probs[0]

            # Check for EOS
            if next_token_id in eos_token_id:
                break

            # Decode token
            token_str = processor.decode([next_token_id], skip_special_tokens=False)

            # Build alternatives list (excluding the selected token)
            alternatives = []
            for alt_idx, alt_prob in zip(top_indices[1:], top_probs[1:]):
                alt_token = processor.decode([alt_idx], skip_special_tokens=False)
                alternatives.append({"token": alt_token, "probability": alt_prob})

            # Entropy of the truncated top-k distribution (a lower bound on the
            # entropy of the full vocabulary distribution)
            all_probs = [next_token_prob] + [alt["probability"] for alt in alternatives]
            token_entropy = calculate_entropy(all_probs)

            # Yield token data
            yield TokenData(
                token=token_str,
                probability=next_token_prob,
                alternatives=alternatives,
                entropy=token_entropy,
            )

            # Update for next iteration
            next_token_tensor = torch.tensor([[next_token_id]], device=_device)
            generated_ids = torch.cat([generated_ids, next_token_tensor], dim=-1)
            # Extend attention mask to cover full sequence (required for Qwen VL models)
            attention_mask = torch.cat(
                [attention_mask, torch.ones((1, 1), device=_device, dtype=attention_mask.dtype)],
                dim=-1,
            )
            seq_length += 1