"""
OCR Confidence Visualization - Gradio Application.
Upload a document image to extract text with confidence visualization.
Supports deployment to HuggingFace Spaces with ZeroGPU via @spaces.GPU decorator.
The decorator is effect-free in non-ZeroGPU environments for local development.
"""
import html
import json
from typing import Generator
import gradio as gr
from PIL import Image
# Import spaces for ZeroGPU support (effect-free outside HuggingFace Spaces)
try:
import spaces
SPACES_AVAILABLE = True
except ImportError:
SPACES_AVAILABLE = False
from model import generate_with_logprobs, load_model, TokenData, AVAILABLE_MODELS, DEFAULT_MODEL
def gpu_decorator(duration: int = 120):
    """
    Build a decorator that requests GPU time on HuggingFace Spaces.

    When the `spaces` package was importable at module load, this returns
    `spaces.GPU(duration=...)`; otherwise it returns an identity decorator,
    so the same code runs unchanged in local development.

    Args:
        duration: Maximum GPU allocation time in seconds (ZeroGPU only).

    Returns:
        A decorator to apply to GPU-bound functions.
    """
    if not SPACES_AVAILABLE:
        # No-op decorator: hand the function back untouched.
        return lambda fn: fn
    return spaces.GPU(duration=duration)
def probability_to_color(prob: float) -> str:
    """
    Map a confidence probability to a hex color for text/underline styling.

    Higher confidence maps to cooler colors (blue/green), lower confidence
    to warmer/alarming colors (red/purple).

    Args:
        prob: Confidence probability (0.0 to 1.0)

    Returns:
        Hex color string
    """
    # (exclusive lower bound, color) pairs, checked from highest band down.
    bands = (
        (0.99, "#3b82f6"),  # Blue - very high confidence
        (0.95, "#16a34a"),  # Dark Green - high confidence
        (0.85, "#65a30d"),  # Darker Light Green - good (darkened for readability)
        (0.70, "#ca8a04"),  # Darker Yellow - moderate (darkened for readability)
        (0.50, "#ef4444"),  # Red - low confidence
    )
    for cutoff, color in bands:
        if prob > cutoff:
            return color
    return "#a855f7"  # Purple - very low confidence
def entropy_to_color(entropy: float) -> str:
    """
    Map Shannon entropy (in bits) to a hex color for visualization.

    Higher entropy = more uncertainty = warmer colors.

    Args:
        entropy: Shannon entropy in bits (0.0 = certain)

    Returns:
        Hex color string
    """
    # (exclusive upper bound, color) pairs, checked from most certain up.
    bands = (
        (0.1, "#3b82f6"),  # Blue - very certain
        (0.3, "#16a34a"),  # Dark Green - certain
        (0.7, "#65a30d"),  # Green - fairly certain
        (1.5, "#ca8a04"),  # Amber - some uncertainty
        (2.5, "#ef4444"),  # Red - uncertain
    )
    for limit, color in bands:
        if entropy < limit:
            return color
    return "#a855f7"  # Purple - very uncertain
def build_html_output(tokens: list[TokenData], mode: str = "probability") -> str:
    """
    Render accumulated tokens as confidence-colored HTML.

    Each non-newline token becomes a <span> colored by either its probability
    or its entropy, carrying data-* attributes read by the client-side
    alternatives panel. Newline-bearing tokens are emitted uncolored as <br>.

    Args:
        tokens: List of TokenData objects
        mode: "probability" for confidence coloring, "entropy" for uncertainty coloring

    Returns:
        HTML string with styled token spans
    """
    # Font stack with emoji support
    font_family = "'Cascadia Code', 'Fira Code', Consolas, monospace, 'Apple Color Emoji', 'Segoe UI Emoji', 'Noto Color Emoji'"
    # CSS for hover underline effect
    style_tag = '<style>.token-span:hover { text-decoration: underline !important; }</style>'

    if not tokens:
        return f'{style_tag}<div class="token-container" style="font-family: {font_family}; line-height: 1.8; padding: 10px;"></div>'

    pieces: list[str] = []
    for td in tokens:
        # Escape HTML entities in token text.
        escaped = html.escape(td.token)
        if "\n" in escaped:
            # Newline tokens carry no useful confidence: render as plain <br>.
            pieces.append(escaped.replace("\n", "<br>"))
            continue

        # Pick coloring scheme by mode.
        color = (
            entropy_to_color(td.entropy)
            if mode == "entropy"
            else probability_to_color(td.probability)
        )
        # Alternatives ride along as escaped JSON in a data attribute.
        alts_json = html.escape(json.dumps(td.alternatives))
        pieces.append(
            f'<span class="token-span" style="color: {color}; '
            f'text-decoration-color: {color}; cursor: pointer;" '
            f'data-prob="{td.probability}" '
            f'data-entropy="{td.entropy}" '
            f'data-alternatives="{alts_json}">'
            f'{escaped}</span>'
        )

    body = "".join(pieces)
    return f'{style_tag}<div class="token-container" style="font-family: {font_family}; line-height: 1.6; padding: 10px; white-space: pre-wrap;">{body}</div>'
@gpu_decorator(duration=120)
def transcribe_full(image: Image.Image, model_name: str | None = None) -> list[TokenData]:
    """
    Run full OCR inference on GPU and return all tokens.

    On HuggingFace Spaces with ZeroGPU, this function is decorated with
    @spaces.GPU to allocate GPU resources for the duration of inference.
    The GPU is released when the function returns.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference (None lets the model
            module pick its default)

    Returns:
        List of TokenData with token strings, probabilities, and alternatives
    """
    # Materialize the generator inside this call so the GPU is held only for
    # the duration of inference (ZeroGPU allocates per decorated call, not
    # across a streaming generator's lifetime).
    return list(generate_with_logprobs(image, model_name=model_name))
def transcribe_streaming(
    image: Image.Image | None, model_name: str | None = None
) -> Generator[tuple[str, str], None, None]:
    """
    Stream OCR transcription with progressive HTML output for both views.

    This function separates GPU-bound inference from HTML rendering:
    1. Shows a "Processing..." indicator during inference
    2. Runs full inference in a single GPU-decorated call
    3. Streams HTML rendering from pre-computed tokens (no GPU needed)

    This architecture is required for HuggingFace ZeroGPU, which allocates
    GPU resources per decorated function call rather than for streaming.

    Args:
        image: PIL Image to process, or None when nothing was uploaded
        model_name: Which model to use for inference (None -> DEFAULT_MODEL)

    Yields:
        Tuple of (probability_html, entropy_html) as tokens stream
    """
    # Guard: nothing uploaded yet -- show the same hint in both tabs and stop.
    if image is None:
        empty = '<div style="color: #666; padding: 10px;">Please upload an image.</div>'
        yield empty, empty
        return

    # Show a spinner while the (potentially slow) GPU inference runs.
    loading = f'''<div style="color: #60a5fa; padding: 10px; display: flex; align-items: center; gap: 10px;">
<div style="width: 20px; height: 20px; border: 2px solid #60a5fa; border-top-color: transparent; border-radius: 50%; animation: spin 1s linear infinite;"></div>
<style>@keyframes spin {{ to {{ transform: rotate(360deg); }} }}</style>
Processing image with {model_name or DEFAULT_MODEL}...
</div>'''
    yield loading, loading

    # Run full inference (GPU allocated here on ZeroGPU).
    tokens = transcribe_full(image, model_name=model_name)

    # Stream HTML rendering from the pre-computed tokens (CPU only).
    accumulated: list[TokenData] = []
    for token in tokens:
        accumulated.append(token)
        prob_html = build_html_output(accumulated, mode="probability")
        entropy_html = build_html_output(accumulated, mode="entropy")
        yield prob_html, entropy_html
# JavaScript for token alternatives panel (loaded via launch js parameter).
# Clicking a rendered token fills #alternatives-panel with its top alternatives.
# NOTE: all user-derived text (token text, alternative tokens) is escaped via
# esc() before being inserted into innerHTML -- the previous
# .replace(/</g,'<') calls were no-ops that left markup injectable.
TOKEN_ALTERNATIVES_JS = """
(function() {
    document.addEventListener('click', function(e) {
        var token = e.target.closest('[data-alternatives]');
        if (!token || !token.dataset.alternatives) return;
        var panel = document.getElementById('alternatives-panel');
        if (!panel) return;

        var prob = parseFloat(token.dataset.prob) || 0;
        var alts = JSON.parse(token.dataset.alternatives);
        var tokenText = token.textContent;

        // Escape user-derived text before inserting into innerHTML.
        var esc = function(s) {
            return String(s).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
        };

        // Build panel content
        var html = '<div style="font-weight:600;margin-bottom:12px;padding-bottom:8px;border-bottom:1px solid #374151;">' +
            'Selected: "<span style="color:#60a5fa">' + esc(tokenText) + '</span>" (' + (prob * 100).toFixed(2) + '%)' +
            '</div>';

        if (alts.length === 0) {
            html += '<div style="color:#9ca3af;font-style:italic">No alternatives available</div>';
        } else {
            html += '<div style="font-size:12px;color:#9ca3af;margin-bottom:8px;">Top ' + Math.min(alts.length, 10) + ' alternatives:</div>';
            for (var i = 0; i < Math.min(alts.length, 10); i++) {
                var alt = alts[i];
                var altProb = (alt.probability * 100).toFixed(2);
                var barWidth = Math.max(alt.probability * 100, 1);
                html += '<div style="display:flex;align-items:center;margin:6px 0;">' +
                    '<span style="width:80px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;font-family:monospace;">' +
                    esc(alt.token) + '</span>' +
                    '<span style="width:55px;text-align:right;color:#9ca3af;font-size:12px;margin-right:10px;">' +
                    altProb + '%</span>' +
                    '<div style="flex:1;height:10px;background:#374151;border-radius:5px;overflow:hidden;">' +
                    '<div style="width:' + barWidth + '%;height:100%;background:#60a5fa;border-radius:5px;"></div>' +
                    '</div></div>';
            }
        }
        panel.innerHTML = html;
    });
})();
"""
# Initial HTML for alternatives panel
ALTERNATIVES_PANEL_INITIAL = '''
<div id="alternatives-panel" style="
padding: 16px;
background: #1f2937;
border-radius: 8px;
color: #e5e7eb;
font-family: system-ui, -apple-system, sans-serif;
font-size: 14px;
min-height: 100px;
">
<div style="color: #9ca3af; font-style: italic;">
Click on any token above to see alternative predictions.
</div>
</div>
'''
# Build Gradio interface
with gr.Blocks(title="OCR Confidence Visualization") as demo:
    gr.Markdown("# OCR Confidence Visualization")
    gr.Markdown("Upload a document image to extract text with token streaming.")

    with gr.Row():
        # Left column: model choice, image upload, and the trigger button.
        with gr.Column(scale=1):
            model_selector = gr.Radio(
                choices=list(AVAILABLE_MODELS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            image_input = gr.Image(type="pil", label="Upload Document")
            submit_btn = gr.Button("Transcribe", variant="primary")
        # Right column: two synchronized views of the same token stream.
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Probability"):
                    output_html_prob = gr.HTML(
                        value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
                    )
                with gr.TabItem("Entropy"):
                    output_html_entropy = gr.HTML(
                        value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
                    )
            # Panel filled in client-side by TOKEN_ALTERNATIVES_JS when a
            # token is clicked.
            gr.Markdown("### Token Alternatives")
            alternatives_html = gr.HTML(
                value=ALTERNATIVES_PANEL_INITIAL,
            )

    # transcribe_streaming is a generator; each yielded tuple updates both
    # HTML views so the two tabs stay in sync while tokens stream.
    submit_btn.click(
        fn=transcribe_streaming,
        inputs=[image_input, model_selector],
        outputs=[output_html_prob, output_html_entropy],
    )
if __name__ == "__main__":
    # Preload model at startup for local development.
    # On HuggingFace Spaces with ZeroGPU, model loading happens on first request
    # when GPU is allocated by the @spaces.GPU decorator.
    if not SPACES_AVAILABLE:
        print("Preloading model (local development)...")
        load_model()
    else:
        print("ZeroGPU detected - model will load on first inference request")
    print("Starting Gradio server...")
    # NOTE(review): passing js= to launch() assumes the pinned Gradio version
    # accepts it there; recent Gradio takes js= on gr.Blocks(...) instead --
    # confirm against the deployed gradio version.
    demo.launch(server_port=7860, js=TOKEN_ALTERNATIVES_JS)
|