File size: 11,758 Bytes
26079d9
c6f6682
26079d9
c6f6682
 
 
 
26079d9
 
c6f6682
 
 
26079d9
c6f6682
26079d9
 
c6f6682
 
 
 
 
 
26079d9
c6f6682
26079d9
 
c6f6682
 
 
26079d9
c6f6682
 
 
 
 
26079d9
 
c6f6682
 
 
26079d9
 
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
26079d9
c6f6682
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
26079d9
 
c6f6682
 
26079d9
c6f6682
 
 
 
 
26079d9
 
 
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
26079d9
c6f6682
 
 
 
 
 
 
26079d9
 
 
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
 
c6f6682
 
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""
OCR Confidence Visualization - Gradio Application.

Upload a document image to extract text with confidence visualization.

Supports deployment to HuggingFace Spaces with ZeroGPU via @spaces.GPU decorator.
The decorator is effect-free in non-ZeroGPU environments for local development.
"""

import html
import json
from typing import Generator

import gradio as gr
from PIL import Image

# Import spaces for ZeroGPU support (effect-free outside HuggingFace Spaces)
try:
    import spaces
    SPACES_AVAILABLE = True
except ImportError:
    SPACES_AVAILABLE = False

from model import generate_with_logprobs, load_model, TokenData, AVAILABLE_MODELS, DEFAULT_MODEL


def gpu_decorator(duration: int = 120):
    """
    Build the decorator used to wrap GPU-bound functions.

    On HuggingFace Spaces this is the real spaces.GPU decorator; anywhere
    else it degrades to an identity function so the same code runs locally.

    Args:
        duration: GPU allocation window in seconds (only meaningful on Spaces).

    Returns:
        A decorator to apply to inference functions.
    """
    if not SPACES_AVAILABLE:
        # No ZeroGPU runtime available: hand the function back untouched.
        def identity(fn):
            return fn

        return identity
    return spaces.GPU(duration=duration)


def probability_to_color(prob: float) -> str:
    """
    Translate a token confidence probability into a display color.

    Bands are checked from highest confidence downward; the first lower
    bound strictly exceeded wins. Anything at or below 0.50 is purple.

    Args:
        prob: Confidence probability (0.0 to 1.0)

    Returns:
        Hex color string
    """
    # (exclusive lower bound, color) pairs, highest confidence first.
    bands = (
        (0.99, "#3b82f6"),  # Blue - very high confidence
        (0.95, "#16a34a"),  # Dark Green - high confidence
        (0.85, "#65a30d"),  # Darker Light Green - good confidence (darkened for readability)
        (0.70, "#ca8a04"),  # Darker Yellow - moderate confidence (darkened for readability)
        (0.50, "#ef4444"),  # Red - low confidence
    )
    for lower_bound, color in bands:
        if prob > lower_bound:
            return color
    return "#a855f7"  # Purple - very low confidence


def entropy_to_color(entropy: float) -> str:
    """
    Translate Shannon entropy (bits) into a display color.

    Bands are checked from most-certain upward; the first exclusive upper
    bound that contains the value wins. Higher entropy = more uncertainty
    = warmer colors; 2.5 bits and above is purple.

    Args:
        entropy: Shannon entropy in bits (0.0 = certain)

    Returns:
        Hex color string
    """
    # (exclusive upper bound, color) pairs, most certain first.
    bands = (
        (0.1, "#3b82f6"),  # Blue - very certain
        (0.3, "#16a34a"),  # Dark Green - certain
        (0.7, "#65a30d"),  # Green - fairly certain
        (1.5, "#ca8a04"),  # Amber - some uncertainty
        (2.5, "#ef4444"),  # Red - uncertain
    )
    for upper_bound, color in bands:
        if entropy < upper_bound:
            return color
    return "#a855f7"  # Purple - very uncertain


def build_html_output(tokens: list[TokenData], mode: str = "probability") -> str:
    """
    Build HTML output from accumulated tokens with confidence coloring.

    Every token — including tokens that contain newlines — is rendered as a
    colored <span> carrying its probability, entropy, and alternatives as
    data attributes. Newlines inside a token are converted to explicit <br>
    breaks without discarding the token's metadata (the previous version
    emitted any newline-bearing token as a bare string, losing its coloring
    and its hover/click data).

    Args:
        tokens: List of TokenData objects
        mode: "probability" for confidence coloring, "entropy" for uncertainty coloring

    Returns:
        HTML string with styled token spans
    """
    # Font stack with emoji support
    font_family = "'Cascadia Code', 'Fira Code', Consolas, monospace, 'Apple Color Emoji', 'Segoe UI Emoji', 'Noto Color Emoji'"

    # CSS for hover underline effect
    style_tag = '<style>.token-span:hover { text-decoration: underline !important; }</style>'

    spans = []
    for token_data in tokens:
        # Escape HTML entities first, then turn newlines into explicit breaks.
        token_text = html.escape(token_data.token).replace("\n", "<br>")

        # Color by the selected visualization mode.
        if mode == "entropy":
            color = entropy_to_color(token_data.entropy)
        else:
            color = probability_to_color(token_data.probability)

        # Alternatives ride along as escaped JSON for the client-side panel.
        alternatives_json = html.escape(json.dumps(token_data.alternatives))

        # Styled span with color (underline on hover via the CSS above).
        spans.append(
            f'<span class="token-span" style="color: {color}; '
            f'text-decoration-color: {color}; cursor: pointer;" '
            f'data-prob="{token_data.probability}" '
            f'data-entropy="{token_data.entropy}" '
            f'data-alternatives="{alternatives_json}">'
            f'{token_text}</span>'
        )

    html_content = "".join(spans)
    # One container style for both the empty and populated cases (previously
    # the empty case used line-height 1.8 and omitted white-space: pre-wrap).
    return (
        f'{style_tag}<div class="token-container" style="font-family: {font_family}; '
        f'line-height: 1.6; padding: 10px; white-space: pre-wrap;">{html_content}</div>'
    )


@gpu_decorator(duration=120)
def transcribe_full(image: Image.Image, model_name: str = None) -> list[TokenData]:
    """
    Execute the complete OCR pass and collect every generated token.

    Under HuggingFace ZeroGPU the @gpu_decorator wrapper allocates a GPU
    for the lifetime of this single call and releases it when the function
    returns; in local development the wrapper is a no-op.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference

    Returns:
        List of TokenData with token strings, probabilities, and alternatives
    """
    token_stream = generate_with_logprobs(image, model_name=model_name)
    return list(token_stream)


def transcribe_streaming(image: Image.Image, model_name: str = None) -> Generator[tuple[str, str], None, None]:
    """
    Yield progressively rendered (probability, entropy) HTML views of an OCR run.

    GPU work and HTML rendering are deliberately decoupled:
    1. A spinner banner is yielded immediately while inference runs.
    2. All tokens are produced by one GPU-decorated transcribe_full() call.
    3. Both HTML views are then re-rendered and yielded token by token,
       entirely on the CPU.

    HuggingFace ZeroGPU allocates the GPU per decorated function call, not
    per generator step, which is why inference itself cannot be streamed.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference

    Yields:
        Tuple of (probability_html, entropy_html) as tokens stream
    """
    if image is None:
        placeholder = '<div style="color: #666; padding: 10px;">Please upload an image.</div>'
        yield placeholder, placeholder
        return

    # Spinner shown while the GPU-bound call below is running.
    busy_banner = f'''<div style="color: #60a5fa; padding: 10px; display: flex; align-items: center; gap: 10px;">
        <div style="width: 20px; height: 20px; border: 2px solid #60a5fa; border-top-color: transparent; border-radius: 50%; animation: spin 1s linear infinite;"></div>
        <style>@keyframes spin {{ to {{ transform: rotate(360deg); }} }}</style>
        Processing image with {model_name or DEFAULT_MODEL}...
    </div>'''
    yield busy_banner, busy_banner

    # Single GPU-allocated inference call (ZeroGPU requirement).
    all_tokens = transcribe_full(image, model_name=model_name)

    # Incremental CPU-side rendering of both visualization modes.
    rendered: list[TokenData] = []
    for token_data in all_tokens:
        rendered.append(token_data)
        yield (
            build_html_output(rendered, mode="probability"),
            build_html_output(rendered, mode="entropy"),
        )


# Client-side click handler: when a rendered token span (anything carrying a
# data-alternatives attribute) is clicked, it populates #alternatives-panel
# with the selected token, its probability, and up to 10 alternatives with
# probability bars.
# NOTE(review): this string is passed as `js=` to demo.launch() at the bottom
# of the file; Gradio documents custom JS as the `js` parameter of
# gr.Blocks() — confirm the installed Gradio version accepts it on launch().
# NOTE(review): the inline replace() below escapes < and > in alternative
# tokens but not & — a token like "&lt;" would render as "<"; verify whether
# ampersand escaping is needed here.
TOKEN_ALTERNATIVES_JS = """
(function() {
    document.addEventListener('click', function(e) {
        var token = e.target.closest('[data-alternatives]');
        if (!token || !token.dataset.alternatives) return;

        var panel = document.getElementById('alternatives-panel');
        if (!panel) return;

        var prob = parseFloat(token.dataset.prob) || 0;
        var alts = JSON.parse(token.dataset.alternatives);
        var tokenText = token.textContent;

        // Build panel content
        var html = '<div style="font-weight:600;margin-bottom:12px;padding-bottom:8px;border-bottom:1px solid #374151;">' +
            'Selected: "<span style="color:#60a5fa">' + tokenText + '</span>" (' + (prob * 100).toFixed(2) + '%)' +
            '</div>';

        if (alts.length === 0) {
            html += '<div style="color:#9ca3af;font-style:italic">No alternatives available</div>';
        } else {
            html += '<div style="font-size:12px;color:#9ca3af;margin-bottom:8px;">Top ' + Math.min(alts.length, 10) + ' alternatives:</div>';
            for (var i = 0; i < Math.min(alts.length, 10); i++) {
                var alt = alts[i];
                var altProb = (alt.probability * 100).toFixed(2);
                var barWidth = Math.max(alt.probability * 100, 1);
                html += '<div style="display:flex;align-items:center;margin:6px 0;">' +
                    '<span style="width:80px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;font-family:monospace;">' +
                    alt.token.replace(/</g,'&lt;').replace(/>/g,'&gt;') + '</span>' +
                    '<span style="width:55px;text-align:right;color:#9ca3af;font-size:12px;margin-right:10px;">' +
                    altProb + '%</span>' +
                    '<div style="flex:1;height:10px;background:#374151;border-radius:5px;overflow:hidden;">' +
                    '<div style="width:' + barWidth + '%;height:100%;background:#60a5fa;border-radius:5px;"></div>' +
                    '</div></div>';
            }
        }

        panel.innerHTML = html;
    });
})();
"""

# Initial HTML for the alternatives panel. The element id "alternatives-panel"
# is the lookup target of the click handler in TOKEN_ALTERNATIVES_JS, so it
# must stay in sync with that script.
ALTERNATIVES_PANEL_INITIAL = '''
<div id="alternatives-panel" style="
    padding: 16px;
    background: #1f2937;
    border-radius: 8px;
    color: #e5e7eb;
    font-family: system-ui, -apple-system, sans-serif;
    font-size: 14px;
    min-height: 100px;
">
    <div style="color: #9ca3af; font-style: italic;">
        Click on any token above to see alternative predictions.
    </div>
</div>
'''

# Build Gradio interface: model picker + image upload on the left, tabbed
# probability/entropy token views plus the alternatives panel on the right.
with gr.Blocks(title="OCR Confidence Visualization") as demo:
    gr.Markdown("# OCR Confidence Visualization")
    gr.Markdown("Upload a document image to extract text with token streaming.")

    with gr.Row():
        with gr.Column(scale=1):
            # Radio over the model registry exported by model.py.
            model_selector = gr.Radio(
                choices=list(AVAILABLE_MODELS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            image_input = gr.Image(type="pil", label="Upload Document")
            submit_btn = gr.Button("Transcribe", variant="primary")

        with gr.Column(scale=2):
            # Two synchronized views of the same token stream; both tabs are
            # updated on every yield from transcribe_streaming.
            with gr.Tabs():
                with gr.TabItem("Probability"):
                    output_html_prob = gr.HTML(
                        value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
                    )
                with gr.TabItem("Entropy"):
                    output_html_entropy = gr.HTML(
                        value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
                    )
            # Detail panel filled client-side by TOKEN_ALTERNATIVES_JS when a
            # token span is clicked.
            gr.Markdown("### Token Alternatives")
            alternatives_html = gr.HTML(
                value=ALTERNATIVES_PANEL_INITIAL,
            )

    # transcribe_streaming is a generator, so Gradio streams each yielded
    # (probability_html, entropy_html) pair into the two outputs.
    submit_btn.click(
        fn=transcribe_streaming,
        inputs=[image_input, model_selector],
        outputs=[output_html_prob, output_html_entropy],
    )


if __name__ == "__main__":
    # Preload model at startup for local development.
    # On HuggingFace Spaces with ZeroGPU, model loading happens on first
    # request, when the GPU is allocated by the @spaces.GPU decorator.
    if not SPACES_AVAILABLE:
        print("Preloading model (local development)...")
        load_model()
    else:
        print("ZeroGPU detected - model will load on first inference request")
    print("Starting Gradio server...")
    # NOTE(review): Gradio documents custom JS as the `js` parameter of the
    # gr.Blocks() constructor; confirm the installed version also accepts
    # `js` on Blocks.launch(), otherwise this raises TypeError at startup.
    demo.launch(server_port=7860, js=TOKEN_ALTERNATIVES_JS)