File size: 11,758 Bytes
26079d9
c6f6682
26079d9
c6f6682
 
 
 
26079d9
 
c6f6682
 
 
26079d9
c6f6682
26079d9
 
c6f6682
 
 
 
 
 
26079d9
c6f6682
26079d9
 
c6f6682
 
 
26079d9
c6f6682
 
 
 
 
26079d9
 
c6f6682
 
 
26079d9
 
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
26079d9
c6f6682
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
26079d9
 
c6f6682
 
26079d9
c6f6682
 
 
 
 
26079d9
 
 
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
26079d9
c6f6682
 
 
 
 
 
 
26079d9
 
 
c6f6682
26079d9
 
c6f6682
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
 
c6f6682
 
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26079d9
c6f6682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""
OCR Confidence Visualization - Gradio Application.

Upload a document image to extract text with confidence visualization.

Supports deployment to HuggingFace Spaces with ZeroGPU via @spaces.GPU decorator.
The decorator is effect-free in non-ZeroGPU environments for local development.
"""

import html
import json
from typing import Generator

import gradio as gr
from PIL import Image

# Import spaces for ZeroGPU support (effect-free outside HuggingFace Spaces)
try:
    import spaces
    SPACES_AVAILABLE = True
except ImportError:
    SPACES_AVAILABLE = False

from model import generate_with_logprobs, load_model, TokenData, AVAILABLE_MODELS, DEFAULT_MODEL


def gpu_decorator(duration: int = 120):
    """
    Build the decorator used to wrap GPU-bound functions.

    On HuggingFace Spaces this is the real spaces.GPU decorator; anywhere
    else it degrades to an identity function so the same code runs locally.

    Args:
        duration: GPU allocation window in seconds (only meaningful on Spaces).

    Returns:
        A decorator to apply to inference functions.
    """
    if not SPACES_AVAILABLE:
        # No ZeroGPU runtime available: hand the function back untouched.
        def identity(fn):
            return fn

        return identity
    return spaces.GPU(duration=duration)


def probability_to_color(prob: float) -> str:
    """
    Translate a token confidence probability into a display color.

    Bands are checked from highest confidence downward; the first lower
    bound strictly exceeded wins. Anything at or below 0.50 is purple.

    Args:
        prob: Confidence probability (0.0 to 1.0)

    Returns:
        Hex color string
    """
    # (exclusive lower bound, color) pairs, highest confidence first.
    bands = (
        (0.99, "#3b82f6"),  # Blue - very high confidence
        (0.95, "#16a34a"),  # Dark Green - high confidence
        (0.85, "#65a30d"),  # Darker Light Green - good confidence (darkened for readability)
        (0.70, "#ca8a04"),  # Darker Yellow - moderate confidence (darkened for readability)
        (0.50, "#ef4444"),  # Red - low confidence
    )
    for lower_bound, color in bands:
        if prob > lower_bound:
            return color
    return "#a855f7"  # Purple - very low confidence


def entropy_to_color(entropy: float) -> str:
    """
    Translate Shannon entropy (bits) into a display color.

    Bands are checked from most-certain upward; the first exclusive upper
    bound that contains the value wins. Higher entropy = more uncertainty
    = warmer colors; 2.5 bits and above is purple.

    Args:
        entropy: Shannon entropy in bits (0.0 = certain)

    Returns:
        Hex color string
    """
    # (exclusive upper bound, color) pairs, most certain first.
    bands = (
        (0.1, "#3b82f6"),  # Blue - very certain
        (0.3, "#16a34a"),  # Dark Green - certain
        (0.7, "#65a30d"),  # Green - fairly certain
        (1.5, "#ca8a04"),  # Amber - some uncertainty
        (2.5, "#ef4444"),  # Red - uncertain
    )
    for upper_bound, color in bands:
        if entropy < upper_bound:
            return color
    return "#a855f7"  # Purple - very uncertain


def build_html_output(tokens: list[TokenData], mode: str = "probability") -> str:
    """
    Build HTML output from accumulated tokens with confidence coloring.

    Every token — including tokens that contain newlines — is rendered as a
    colored <span> carrying its probability, entropy, and alternatives as
    data attributes. Newlines inside a token are converted to explicit <br>
    breaks without discarding the token's metadata (the previous version
    emitted any newline-bearing token as a bare string, losing its coloring
    and its hover/click data).

    Args:
        tokens: List of TokenData objects
        mode: "probability" for confidence coloring, "entropy" for uncertainty coloring

    Returns:
        HTML string with styled token spans
    """
    # Font stack with emoji support
    font_family = "'Cascadia Code', 'Fira Code', Consolas, monospace, 'Apple Color Emoji', 'Segoe UI Emoji', 'Noto Color Emoji'"

    # CSS for hover underline effect
    style_tag = '<style>.token-span:hover { text-decoration: underline !important; }</style>'

    spans = []
    for token_data in tokens:
        # Escape HTML entities first, then turn newlines into explicit breaks.
        token_text = html.escape(token_data.token).replace("\n", "<br>")

        # Color by the selected visualization mode.
        if mode == "entropy":
            color = entropy_to_color(token_data.entropy)
        else:
            color = probability_to_color(token_data.probability)

        # Alternatives ride along as escaped JSON for the client-side panel.
        alternatives_json = html.escape(json.dumps(token_data.alternatives))

        # Styled span with color (underline on hover via the CSS above).
        spans.append(
            f'<span class="token-span" style="color: {color}; '
            f'text-decoration-color: {color}; cursor: pointer;" '
            f'data-prob="{token_data.probability}" '
            f'data-entropy="{token_data.entropy}" '
            f'data-alternatives="{alternatives_json}">'
            f'{token_text}</span>'
        )

    html_content = "".join(spans)
    # One container style for both the empty and populated cases (previously
    # the empty case used line-height 1.8 and omitted white-space: pre-wrap).
    return (
        f'{style_tag}<div class="token-container" style="font-family: {font_family}; '
        f'line-height: 1.6; padding: 10px; white-space: pre-wrap;">{html_content}</div>'
    )


@gpu_decorator(duration=120)
def transcribe_full(image: Image.Image, model_name: str = None) -> list[TokenData]:
    """
    Execute the complete OCR pass and collect every generated token.

    Under HuggingFace ZeroGPU the @gpu_decorator wrapper allocates a GPU
    for the lifetime of this single call and releases it when the function
    returns; in local development the wrapper is a no-op.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference

    Returns:
        List of TokenData with token strings, probabilities, and alternatives
    """
    token_stream = generate_with_logprobs(image, model_name=model_name)
    return list(token_stream)


def transcribe_streaming(image: Image.Image, model_name: str = None) -> Generator[tuple[str, str], None, None]:
    """
    Yield progressively rendered (probability, entropy) HTML views of an OCR run.

    GPU work and HTML rendering are deliberately decoupled:
    1. A spinner banner is yielded immediately while inference runs.
    2. All tokens are produced by one GPU-decorated transcribe_full() call.
    3. Both HTML views are then re-rendered and yielded token by token,
       entirely on the CPU.

    HuggingFace ZeroGPU allocates the GPU per decorated function call, not
    per generator step, which is why inference itself cannot be streamed.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference

    Yields:
        Tuple of (probability_html, entropy_html) as tokens stream
    """
    if image is None:
        placeholder = '<div style="color: #666; padding: 10px;">Please upload an image.</div>'
        yield placeholder, placeholder
        return

    # Spinner shown while the GPU-bound call below is running.
    busy_banner = f'''<div style="color: #60a5fa; padding: 10px; display: flex; align-items: center; gap: 10px;">
        <div style="width: 20px; height: 20px; border: 2px solid #60a5fa; border-top-color: transparent; border-radius: 50%; animation: spin 1s linear infinite;"></div>
        <style>@keyframes spin {{ to {{ transform: rotate(360deg); }} }}</style>
        Processing image with {model_name or DEFAULT_MODEL}...
    </div>'''
    yield busy_banner, busy_banner

    # Single GPU-allocated inference call (ZeroGPU requirement).
    all_tokens = transcribe_full(image, model_name=model_name)

    # Incremental CPU-side rendering of both visualization modes.
    rendered: list[TokenData] = []
    for token_data in all_tokens:
        rendered.append(token_data)
        yield (
            build_html_output(rendered, mode="probability"),
            build_html_output(rendered, mode="entropy"),
        )


# Client-side click handler: when a rendered token span (anything carrying a
# data-alternatives attribute) is clicked, it populates #alternatives-panel
# with the selected token, its probability, and up to 10 alternatives with
# probability bars.
# NOTE(review): this string is passed as `js=` to demo.launch() at the bottom
# of the file; Gradio documents custom JS as the `js` parameter of
# gr.Blocks() — confirm the installed Gradio version accepts it on launch().
# NOTE(review): the inline replace() below escapes < and > in alternative
# tokens but not & — a token like "&lt;" would render as "<"; verify whether
# ampersand escaping is needed here.
TOKEN_ALTERNATIVES_JS = """
(function() {
    document.addEventListener('click', function(e) {
        var token = e.target.closest('[data-alternatives]');
        if (!token || !token.dataset.alternatives) return;

        var panel = document.getElementById('alternatives-panel');
        if (!panel) return;

        var prob = parseFloat(token.dataset.prob) || 0;
        var alts = JSON.parse(token.dataset.alternatives);
        var tokenText = token.textContent;

        // Build panel content
        var html = '<div style="font-weight:600;margin-bottom:12px;padding-bottom:8px;border-bottom:1px solid #374151;">' +
            'Selected: "<span style="color:#60a5fa">' + tokenText + '</span>" (' + (prob * 100).toFixed(2) + '%)' +
            '</div>';

        if (alts.length === 0) {
            html += '<div style="color:#9ca3af;font-style:italic">No alternatives available</div>';
        } else {
            html += '<div style="font-size:12px;color:#9ca3af;margin-bottom:8px;">Top ' + Math.min(alts.length, 10) + ' alternatives:</div>';
            for (var i = 0; i < Math.min(alts.length, 10); i++) {
                var alt = alts[i];
                var altProb = (alt.probability * 100).toFixed(2);
                var barWidth = Math.max(alt.probability * 100, 1);
                html += '<div style="display:flex;align-items:center;margin:6px 0;">' +
                    '<span style="width:80px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;font-family:monospace;">' +
                    alt.token.replace(/</g,'&lt;').replace(/>/g,'&gt;') + '</span>' +
                    '<span style="width:55px;text-align:right;color:#9ca3af;font-size:12px;margin-right:10px;">' +
                    altProb + '%</span>' +
                    '<div style="flex:1;height:10px;background:#374151;border-radius:5px;overflow:hidden;">' +
                    '<div style="width:' + barWidth + '%;height:100%;background:#60a5fa;border-radius:5px;"></div>' +
                    '</div></div>';
            }
        }

        panel.innerHTML = html;
    });
})();
"""

# Initial HTML for the alternatives panel. The element id "alternatives-panel"
# is the lookup target of the click handler in TOKEN_ALTERNATIVES_JS, so it
# must stay in sync with that script.
ALTERNATIVES_PANEL_INITIAL = '''
<div id="alternatives-panel" style="
    padding: 16px;
    background: #1f2937;
    border-radius: 8px;
    color: #e5e7eb;
    font-family: system-ui, -apple-system, sans-serif;
    font-size: 14px;
    min-height: 100px;
">
    <div style="color: #9ca3af; font-style: italic;">
        Click on any token above to see alternative predictions.
    </div>
</div>
'''

# Build Gradio interface: model picker + image upload on the left, tabbed
# probability/entropy token views plus the alternatives panel on the right.
with gr.Blocks(title="OCR Confidence Visualization") as demo:
    gr.Markdown("# OCR Confidence Visualization")
    gr.Markdown("Upload a document image to extract text with token streaming.")

    with gr.Row():
        with gr.Column(scale=1):
            # Radio over the model registry exported by model.py.
            model_selector = gr.Radio(
                choices=list(AVAILABLE_MODELS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            image_input = gr.Image(type="pil", label="Upload Document")
            submit_btn = gr.Button("Transcribe", variant="primary")

        with gr.Column(scale=2):
            # Two synchronized views of the same token stream; both tabs are
            # updated on every yield from transcribe_streaming.
            with gr.Tabs():
                with gr.TabItem("Probability"):
                    output_html_prob = gr.HTML(
                        value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
                    )
                with gr.TabItem("Entropy"):
                    output_html_entropy = gr.HTML(
                        value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
                    )
            # Detail panel filled client-side by TOKEN_ALTERNATIVES_JS when a
            # token span is clicked.
            gr.Markdown("### Token Alternatives")
            alternatives_html = gr.HTML(
                value=ALTERNATIVES_PANEL_INITIAL,
            )

    # transcribe_streaming is a generator, so Gradio streams each yielded
    # (probability_html, entropy_html) pair into the two outputs.
    submit_btn.click(
        fn=transcribe_streaming,
        inputs=[image_input, model_selector],
        outputs=[output_html_prob, output_html_entropy],
    )


if __name__ == "__main__":
    # Preload model at startup for local development.
    # On HuggingFace Spaces with ZeroGPU, model loading happens on first
    # request, when the GPU is allocated by the @spaces.GPU decorator.
    if not SPACES_AVAILABLE:
        print("Preloading model (local development)...")
        load_model()
    else:
        print("ZeroGPU detected - model will load on first inference request")
    print("Starting Gradio server...")
    # NOTE(review): Gradio documents custom JS as the `js` parameter of the
    # gr.Blocks() constructor; confirm the installed version also accepts
    # `js` on Blocks.launch(), otherwise this raises TypeError at startup.
    demo.launch(server_port=7860, js=TOKEN_ALTERNATIVES_JS)