ryandt committed on
Commit
c6f6682
·
verified ·
1 Parent(s): b94bee0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +264 -285
app.py CHANGED
@@ -1,333 +1,312 @@
1
  """
2
- Model loading and inference for OCR Confidence Visualization.
3
 
4
- Loads Nanonets-OCR2-3B (Qwen2.5-VL fine-tune) and provides
5
- inference with token-level probability extraction.
 
 
6
  """
7
 
8
- import math
9
- from dataclasses import dataclass, field
10
- from typing import Generator, Optional
11
 
12
- import torch
13
  from PIL import Image
14
- from transformers import AutoModelForImageTextToText, AutoProcessor
15
-
16
- # Available models for selection
17
- AVAILABLE_MODELS = {
18
- "Nanonets-OCR2-3B": "nanonets/Nanonets-OCR2-3B",
19
- "olmOCR-7B": "allenai/olmOCR-7B-0725",
20
- "Aya-Vision-8B": "CohereLabs/aya-vision-8b",
21
- }
22
 
23
- DEFAULT_MODEL = "Aya-Vision-8B"
 
 
 
 
 
24
 
25
- # Global model and processor (loaded once per model)
26
- _model = None
27
- _processor = None
28
- _device = None
29
- _current_model_name = None
30
 
31
 
32
@dataclass
class TokenData:
    """Data for a single generated token with probability info."""

    # Decoded token text.
    token: str
    # Softmax probability the model assigned to this token when it was chosen.
    probability: float
    # Top-k runner-up predictions: [{"token": str, "probability": float}, ...]
    alternatives: list[dict[str, float]]
    # Shannon entropy (bits) of the reported distribution; 0.0 = fully certain.
    entropy: float = 0.0
 
42
def calculate_entropy(probs: list[float]) -> float:
    """Calculate Shannon entropy in bits from a probability distribution.

    Args:
        probs: List of probabilities (should sum to ~1.0).

    Returns:
        Entropy in bits. 0.0 for empty or single-certainty distributions.
    """
    # Zero-probability terms contribute nothing (lim p*log2(p) -> 0), so skip
    # them; this also avoids math.log2(0) raising ValueError.
    return -sum((p * math.log2(p) for p in probs if p > 0), 0.0)
58
def load_model(model_name: Optional[str] = None):
    """Load the OCR model and processor, caching them in module globals.

    Reloads (freeing the previous checkpoint first) when model_name differs
    from the currently loaded model.

    Args:
        model_name: Key into AVAILABLE_MODELS. Defaults to DEFAULT_MODEL;
            unknown names silently fall back to the default model.

    Returns:
        Tuple of (model, processor).
    """
    global _model, _processor, _device, _current_model_name

    if model_name is None:
        model_name = DEFAULT_MODEL

    model_id = AVAILABLE_MODELS.get(model_name, AVAILABLE_MODELS[DEFAULT_MODEL])

    # Return cached model if already loaded
    if _model is not None and _current_model_name == model_name:
        return _model, _processor

    # Unload previous model if switching, releasing memory before the new
    # checkpoint is materialized.
    if _model is not None:
        print(f"Unloading previous model: {_current_model_name}")
        del _model
        del _processor
        _model = None
        _processor = None
        # Only touch the CUDA allocator when CUDA actually exists (CPU-only
        # hosts would otherwise pay for / fail CUDA initialization here).
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    _device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {_device}")
    print(f"Loading model: {model_id}...")

    _processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    _model = AutoModelForImageTextToText.from_pretrained(
        model_id,
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    ).to(_device).eval()

    _current_model_name = model_name
    print("Model loaded successfully")
    return _model, _processor
97
def run_ocr(image: Image.Image, prompt: Optional[str] = None) -> str:
    """
    Run OCR on an image and return extracted text.

    Args:
        image: PIL Image to process
        prompt: Optional custom prompt (default: natural reading extraction)

    Returns:
        Extracted text from the image
    """
    model, processor = load_model()

    if prompt is None:
        prompt = "Extract the text from the above document as if you were reading it naturally."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
    ).to(_device)

    # NOTE(review): do_sample=True makes this path non-deterministic; the
    # streaming path (generate_with_logprobs) selects argmax instead. Kept
    # as-is to preserve existing behavior.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=1,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.1,
        )

    # Slice off the prompt tokens so only newly generated text is decoded.
    generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    return output_text
156
def generate_with_logprobs(
    image: Image.Image,
    prompt: Optional[str] = None,
    max_new_tokens: int = 1024,
    top_k: int = 20,
    top_p: float = 0.9,
    temperature: float = 1.0,  # 1.0 keeps the raw distribution; selection is argmax
    repetition_penalty: float = 1.1,
    model_name: Optional[str] = None,
) -> Generator[TokenData, None, None]:
    """
    Generate OCR text token-by-token with probability information.

    Yields TokenData for each generated token, enabling streaming display
    with confidence visualization. Decoding is greedy: the argmax of the
    temperature-scaled, repetition-penalized distribution is always taken.

    Args:
        image: PIL Image to process
        prompt: Optional custom prompt (default: natural reading extraction)
        max_new_tokens: Maximum tokens to generate
        top_k: Number of top alternatives to include
        top_p: Nucleus sampling parameter. NOTE(review): accepted for API
            compatibility but currently unused — selection is pure argmax.
        temperature: Scaling applied to logits before softmax
        repetition_penalty: Penalty for repeating tokens (>1.0 reduces repetition)
        model_name: Which model to use (from AVAILABLE_MODELS keys)

    Yields:
        TokenData with token string, probability, and top-k alternatives
    """
    model, processor = load_model(model_name)

    if prompt is None:
        prompt = "Extract the text from the above document as if you were reading it naturally."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
    ).to(_device)

    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    # Resolve EOS token id(s) for stopping: model config first, then tokenizer.
    # BUGFIX: normalize *every* source to a list — the config may hold a bare
    # int, which would make `next_token_id in eos_token_id` raise TypeError.
    eos_token_id = model.config.eos_token_id
    if eos_token_id is None:
        eos_token_id = processor.tokenizer.eos_token_id
    if eos_token_id is None:
        eos_token_id = []  # No EOS token - will rely on max_new_tokens
    elif isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]

    # Track generated tokens (prompt + everything produced so far)
    generated_ids = input_ids.clone()

    # Extract image inputs (pixel_values, image_grid_thw for Qwen2.5-VL)
    model_inputs = {k: v for k, v in inputs.items() if k not in ("input_ids", "attention_mask")}

    # Use DynamicCache for proper KV cache management
    from transformers import DynamicCache
    past_key_values = DynamicCache()

    # Track sequence length for cache_position
    seq_length = input_ids.shape[1]

    # rope_deltas for multimodal RoPE (required for Qwen2.5-VL): computed on
    # the first forward pass and passed to all subsequent passes.
    rope_deltas = None

    with torch.no_grad():
        for step in range(max_new_tokens):
            if step == 0:
                # First step: full prompt + image data primes the KV cache.
                cache_position = torch.arange(seq_length, device=_device)
                outputs = model(
                    input_ids=generated_ids,
                    attention_mask=attention_mask,
                    cache_position=cache_position,
                    past_key_values=past_key_values,
                    **model_inputs,
                    return_dict=True,
                    use_cache=True,
                )
            else:
                # Subsequent steps: only the newest token, reusing the cache.
                cache_position = torch.tensor([seq_length], device=_device)
                outputs = model(
                    input_ids=generated_ids[:, -1:],
                    attention_mask=attention_mask,
                    cache_position=cache_position,
                    past_key_values=past_key_values,
                    rope_deltas=rope_deltas,  # correct multimodal position encoding
                    return_dict=True,
                    use_cache=True,
                )

            past_key_values = outputs.past_key_values
            # Capture rope_deltas from the first pass for later steps.
            if step == 0 and getattr(outputs, "rope_deltas", None) is not None:
                rope_deltas = outputs.rope_deltas

            # float32 logits avoid fp16 overflow/underflow in softmax.
            next_token_logits = outputs.logits[:, -1, :].float()

            # Apply repetition penalty once per *distinct* previous token.
            # BUGFIX: iterating the raw token list applied the penalty again
            # for every repeat of a token, compounding it exponentially.
            if repetition_penalty != 1.0:
                for prev_token_id in set(generated_ids[0].tolist()):
                    if next_token_logits[0, prev_token_id] < 0:
                        next_token_logits[0, prev_token_id] *= repetition_penalty
                    else:
                        next_token_logits[0, prev_token_id] /= repetition_penalty

            # Apply temperature
            if temperature > 0:
                next_token_logits = next_token_logits / temperature

            # Compute probabilities via softmax
            probs = torch.softmax(next_token_logits, dim=-1)

            # Get top-k probabilities and indices
            top_probs, top_indices = torch.topk(probs, k=min(top_k, probs.shape[-1]))
            top_probs = top_probs[0].cpu().tolist()
            top_indices = top_indices[0].cpu().tolist()

            # Greedy selection: highest-probability token.
            next_token_id = top_indices[0]
            next_token_prob = top_probs[0]

            # Check for EOS
            if next_token_id in eos_token_id:
                break

            # Decode token
            token_str = processor.decode([next_token_id], skip_special_tokens=False)

            # Alternatives exclude the selected token (index 0).
            alternatives = [
                {
                    "token": processor.decode([alt_idx], skip_special_tokens=False),
                    "probability": alt_prob,
                }
                for alt_idx, alt_prob in zip(top_indices[1:], top_probs[1:])
            ]

            # Entropy over the truncated top-k distribution (underestimates
            # full-vocabulary entropy, but is stable and cheap).
            all_probs = [next_token_prob] + [alt["probability"] for alt in alternatives]
            token_entropy = calculate_entropy(all_probs)

            # Yield token data
            yield TokenData(
                token=token_str,
                probability=next_token_prob,
                alternatives=alternatives,
                entropy=token_entropy,
            )

            # Append the chosen token and extend the attention mask to cover
            # the full sequence (required for Qwen VL models).
            next_token_tensor = torch.tensor([[next_token_id]], device=_device)
            generated_ids = torch.cat([generated_ids, next_token_tensor], dim=-1)
            attention_mask = torch.cat(
                [attention_mask, torch.ones((1, 1), device=_device, dtype=attention_mask.dtype)],
                dim=-1,
            )
            seq_length += 1
 
1
  """
2
+ OCR Confidence Visualization - Gradio Application.
3
 
4
+ Upload a document image to extract text with confidence visualization.
5
+
6
+ Supports deployment to HuggingFace Spaces with ZeroGPU via @spaces.GPU decorator.
7
+ The decorator is effect-free in non-ZeroGPU environments for local development.
8
  """
9
 
10
+ import html
11
+ import json
12
+ from typing import Generator
13
 
14
+ import gradio as gr
15
  from PIL import Image
 
 
 
 
 
 
 
 
16
 
17
+ # Import spaces for ZeroGPU support (effect-free outside HuggingFace Spaces)
18
+ try:
19
+ import spaces
20
+ SPACES_AVAILABLE = True
21
+ except ImportError:
22
+ SPACES_AVAILABLE = False
23
 
24
+ from model import generate_with_logprobs, load_model, TokenData, AVAILABLE_MODELS, DEFAULT_MODEL
 
 
 
 
25
 
26
 
27
def gpu_decorator(duration: int = 120):
    """
    Return @spaces.GPU decorator if available, otherwise a no-op decorator.

    This allows the code to work both locally and on HuggingFace Spaces.
    """
    if not SPACES_AVAILABLE:
        # Identity decorator: leaves the wrapped function untouched.
        return lambda fn: fn
    return spaces.GPU(duration=duration)
 
38
def probability_to_color(prob: float) -> str:
    """
    Map probability to a color for text and underline styling.

    Args:
        prob: Confidence probability (0.0 to 1.0)

    Returns:
        Hex color string
    """
    # (exclusive lower bound, color) pairs, most to least confident.
    bands = (
        (0.99, "#3b82f6"),  # Blue - very high confidence
        (0.95, "#16a34a"),  # Dark Green - high confidence
        (0.85, "#65a30d"),  # Darker Light Green - good confidence (darkened for readability)
        (0.70, "#ca8a04"),  # Darker Yellow - moderate confidence (darkened for readability)
        (0.50, "#ef4444"),  # Red - low confidence
    )
    for lower_bound, color in bands:
        if prob > lower_bound:
            return color
    return "#a855f7"  # Purple - very low confidence
+
62
def entropy_to_color(entropy: float) -> str:
    """
    Map entropy (in bits) to a color for visualization.

    Higher entropy = more uncertainty = warmer colors.

    Args:
        entropy: Shannon entropy in bits (0.0 = certain)

    Returns:
        Hex color string
    """
    # (exclusive upper bound, color) pairs, most to least certain.
    bands = (
        (0.1, "#3b82f6"),  # Blue - very certain
        (0.3, "#16a34a"),  # Dark Green - certain
        (0.7, "#65a30d"),  # Green - fairly certain
        (1.5, "#ca8a04"),  # Amber - some uncertainty
        (2.5, "#ef4444"),  # Red - uncertain
    )
    for upper_bound, color in bands:
        if entropy < upper_bound:
            return color
    return "#a855f7"  # Purple - very uncertain
88
def build_html_output(tokens: list[TokenData], mode: str = "probability") -> str:
    """
    Build HTML output from accumulated tokens with confidence coloring.

    Args:
        tokens: List of TokenData objects
        mode: "probability" for confidence coloring, "entropy" for uncertainty coloring

    Returns:
        HTML string with styled token spans
    """
    # Font stack with emoji support
    font_family = "'Cascadia Code', 'Fira Code', Consolas, monospace, 'Apple Color Emoji', 'Segoe UI Emoji', 'Noto Color Emoji'"

    # CSS for hover underline effect
    style_tag = '<style>.token-span:hover { text-decoration: underline !important; }</style>'

    if not tokens:
        return f'{style_tag}<div class="token-container" style="font-family: {font_family}; line-height: 1.8; padding: 10px;"></div>'

    pieces: list[str] = []
    for td in tokens:
        # Escape HTML entities before any markup is added.
        token_text = html.escape(td.token)

        if "\n" in token_text:
            # Newline-bearing tokens become plain <br> breaks (uncolored).
            pieces.append(token_text.replace("\n", "<br>"))
            continue

        # Pick the color scheme for the requested view.
        if mode == "entropy":
            color = entropy_to_color(td.entropy)
        else:
            color = probability_to_color(td.probability)

        # Alternatives ride along as escaped JSON in a data attribute so the
        # front-end click handler can render them without another request.
        alternatives_json = html.escape(json.dumps(td.alternatives))

        pieces.append(
            f'<span class="token-span" style="color: {color}; '
            f'text-decoration-color: {color}; cursor: pointer;" '
            f'data-prob="{td.probability}" '
            f'data-entropy="{td.entropy}" '
            f'data-alternatives="{alternatives_json}">'
            f'{token_text}</span>'
        )

    html_content = "".join(pieces)
    return f'{style_tag}<div class="token-container" style="font-family: {font_family}; line-height: 1.6; padding: 10px; white-space: pre-wrap;">{html_content}</div>'
 
142
@gpu_decorator(duration=120)
def transcribe_full(image: Image.Image, model_name: str = None) -> list[TokenData]:
    """
    Run full OCR inference on GPU and return all tokens.

    On HuggingFace Spaces with ZeroGPU, this function is decorated with
    @spaces.GPU to allocate GPU resources for the duration of inference.
    The GPU is released when the function returns.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference

    Returns:
        List of TokenData with token strings, probabilities, and alternatives
    """
    # Materialize the whole generator inside the decorated call so the GPU
    # can be released as soon as inference completes; rendering happens later.
    collected = [token for token in generate_with_logprobs(image, model_name=model_name)]
    return collected
161
def transcribe_streaming(image: Image.Image, model_name: str = None) -> Generator[tuple[str, str], None, None]:
    """
    Stream OCR transcription with progressive HTML output for both views.

    This function separates GPU-bound inference from HTML rendering:
    1. Shows a "Processing..." indicator during inference
    2. Runs full inference in a single GPU-decorated call
    3. Streams HTML rendering from pre-computed tokens (no GPU needed)

    This architecture is required for HuggingFace ZeroGPU, which allocates
    GPU resources per decorated function call rather than for streaming.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference

    Yields:
        Tuple of (probability_html, entropy_html) as tokens stream
    """
    if image is None:
        empty = '<div style="color: #666; padding: 10px;">Please upload an image.</div>'
        yield empty, empty
        return

    # Spinner shown in both tabs while the GPU-decorated call runs.
    loading = f'''<div style="color: #60a5fa; padding: 10px; display: flex; align-items: center; gap: 10px;">
    <div style="width: 20px; height: 20px; border: 2px solid #60a5fa; border-top-color: transparent; border-radius: 50%; animation: spin 1s linear infinite;"></div>
    <style>@keyframes spin {{ to {{ transform: rotate(360deg); }} }}</style>
    Processing image with {model_name or DEFAULT_MODEL}...
    </div>'''
    yield loading, loading

    # Run full inference (GPU allocated here on ZeroGPU)
    tokens = transcribe_full(image, model_name=model_name)

    # Re-render both views after each token to simulate streaming (CPU only).
    shown: list[TokenData] = []
    for token_data in tokens:
        shown.append(token_data)
        yield (
            build_html_output(shown, mode="probability"),
            build_html_output(shown, mode="entropy"),
        )
205
+ # JavaScript for token alternatives panel (loaded via launch js parameter)
206
+ TOKEN_ALTERNATIVES_JS = """
207
+ (function() {
208
+ document.addEventListener('click', function(e) {
209
+ var token = e.target.closest('[data-alternatives]');
210
+ if (!token || !token.dataset.alternatives) return;
211
+
212
+ var panel = document.getElementById('alternatives-panel');
213
+ if (!panel) return;
214
+
215
+ var prob = parseFloat(token.dataset.prob) || 0;
216
+ var alts = JSON.parse(token.dataset.alternatives);
217
+ var tokenText = token.textContent;
218
+
219
+ // Build panel content
220
+ var html = '<div style="font-weight:600;margin-bottom:12px;padding-bottom:8px;border-bottom:1px solid #374151;">' +
221
+ 'Selected: "<span style="color:#60a5fa">' + tokenText + '</span>" (' + (prob * 100).toFixed(2) + '%)' +
222
+ '</div>';
223
+
224
+ if (alts.length === 0) {
225
+ html += '<div style="color:#9ca3af;font-style:italic">No alternatives available</div>';
226
+ } else {
227
+ html += '<div style="font-size:12px;color:#9ca3af;margin-bottom:8px;">Top ' + Math.min(alts.length, 10) + ' alternatives:</div>';
228
+ for (var i = 0; i < Math.min(alts.length, 10); i++) {
229
+ var alt = alts[i];
230
+ var altProb = (alt.probability * 100).toFixed(2);
231
+ var barWidth = Math.max(alt.probability * 100, 1);
232
+ html += '<div style="display:flex;align-items:center;margin:6px 0;">' +
233
+ '<span style="width:80px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;font-family:monospace;">' +
234
+ alt.token.replace(/</g,'&lt;').replace(/>/g,'&gt;') + '</span>' +
235
+ '<span style="width:55px;text-align:right;color:#9ca3af;font-size:12px;margin-right:10px;">' +
236
+ altProb + '%</span>' +
237
+ '<div style="flex:1;height:10px;background:#374151;border-radius:5px;overflow:hidden;">' +
238
+ '<div style="width:' + barWidth + '%;height:100%;background:#60a5fa;border-radius:5px;"></div>' +
239
+ '</div></div>';
240
+ }
241
  }
 
242
 
243
+ panel.innerHTML = html;
244
+ });
245
+ })();
246
+ """
247
 
248
+ # Initial HTML for alternatives panel
249
+ ALTERNATIVES_PANEL_INITIAL = '''
250
+ <div id="alternatives-panel" style="
251
+ padding: 16px;
252
+ background: #1f2937;
253
+ border-radius: 8px;
254
+ color: #e5e7eb;
255
+ font-family: system-ui, -apple-system, sans-serif;
256
+ font-size: 14px;
257
+ min-height: 100px;
258
+ ">
259
+ <div style="color: #9ca3af; font-style: italic;">
260
+ Click on any token above to see alternative predictions.
261
+ </div>
262
+ </div>
263
+ '''
264
+
265
+ # Build Gradio interface
266
+ with gr.Blocks(title="OCR Confidence Visualization") as demo:
267
+ gr.Markdown("# OCR Confidence Visualization")
268
+ gr.Markdown("Upload a document image to extract text with token streaming.")
269
+
270
+ with gr.Row():
271
+ with gr.Column(scale=1):
272
+ model_selector = gr.Radio(
273
+ choices=list(AVAILABLE_MODELS.keys()),
274
+ value=DEFAULT_MODEL,
275
+ label="Model",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  )
277
+ image_input = gr.Image(type="pil", label="Upload Document")
278
+ submit_btn = gr.Button("Transcribe", variant="primary")
279
+
280
+ with gr.Column(scale=2):
281
+ with gr.Tabs():
282
+ with gr.TabItem("Probability"):
283
+ output_html_prob = gr.HTML(
284
+ value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
285
+ )
286
+ with gr.TabItem("Entropy"):
287
+ output_html_entropy = gr.HTML(
288
+ value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
289
+ )
290
+ gr.Markdown("### Token Alternatives")
291
+ alternatives_html = gr.HTML(
292
+ value=ALTERNATIVES_PANEL_INITIAL,
293
  )
294
+
295
+ submit_btn.click(
296
+ fn=transcribe_streaming,
297
+ inputs=[image_input, model_selector],
298
+ outputs=[output_html_prob, output_html_entropy],
299
+ )
300
+
301
+
302
+ if __name__ == "__main__":
303
+ # Preload model at startup for local development
304
+ # On HuggingFace Spaces with ZeroGPU, model loading happens on first request
305
+ # when GPU is allocated by the @spaces.GPU decorator
306
+ if not SPACES_AVAILABLE:
307
+ print("Preloading model (local development)...")
308
+ load_model()
309
+ else:
310
+ print("ZeroGPU detected - model will load on first inference request")
311
+ print("Starting Gradio server...")
312
+ demo.launch(server_port=7860, js=TOKEN_ALTERNATIVES_JS)