# ocr-entropy / app.py
# ryandt's picture
# Update app.py
# c6f6682 verified
"""
OCR Confidence Visualization - Gradio Application.
Upload a document image to extract text with confidence visualization.
Supports deployment to HuggingFace Spaces with ZeroGPU via @spaces.GPU decorator.
The decorator is effect-free in non-ZeroGPU environments for local development.
"""
import html
import json
from typing import Generator
import gradio as gr
from PIL import Image
# Import spaces for ZeroGPU support (effect-free outside HuggingFace Spaces)
try:
import spaces
SPACES_AVAILABLE = True
except ImportError:
SPACES_AVAILABLE = False
from model import generate_with_logprobs, load_model, TokenData, AVAILABLE_MODELS, DEFAULT_MODEL
def gpu_decorator(duration: int = 120):
    """
    Choose the decorator used to wrap GPU-bound functions.

    If the optional ``spaces`` import at the top of this module succeeded,
    the result is ``spaces.GPU(duration=duration)``. Otherwise an identity
    decorator is returned, so decorated functions run unchanged.

    Args:
        duration: Forwarded to ``spaces.GPU`` as its ``duration`` keyword.

    Returns:
        A decorator that can be applied to a function.
    """
    if not SPACES_AVAILABLE:
        # No ``spaces`` package: hand the function back untouched.
        def passthrough(fn):
            return fn

        return passthrough

    return spaces.GPU(duration=duration)
def probability_to_color(prob: float) -> str:
    """
    Pick the display color for a token from its confidence probability.

    The bands are checked from most to least confident and each lower bound
    is exclusive, so a value exactly on a boundary falls into the band below.

    Args:
        prob: Confidence probability (0.0 to 1.0)

    Returns:
        Hex color string
    """
    # (exclusive lower bound, color), ordered from highest confidence down.
    bands = (
        (0.99, "#3b82f6"),  # Blue - very high confidence
        (0.95, "#16a34a"),  # Dark green - high confidence
        (0.85, "#65a30d"),  # Darkened light green - good confidence
        (0.70, "#ca8a04"),  # Darkened yellow - moderate confidence
        (0.50, "#ef4444"),  # Red - low confidence
    )
    # Anything that clears no bound (including NaN) is "very low confidence".
    return next(
        (color for floor, color in bands if prob > floor),
        "#a855f7",  # Purple - very low confidence
    )
def entropy_to_color(entropy: float) -> str:
    """
    Pick the display color for a token from its entropy (in bits).

    Lower entropy means more certainty and a cooler color. The bands are
    checked from most to least certain and each upper bound is exclusive,
    so a value exactly on a boundary falls into the next, less certain band.

    Args:
        entropy: Shannon entropy in bits (0.0 = certain)

    Returns:
        Hex color string
    """
    # (exclusive upper bound, color), ordered from most certain upward.
    bands = (
        (0.1, "#3b82f6"),  # Blue - very certain
        (0.3, "#16a34a"),  # Dark green - certain
        (0.7, "#65a30d"),  # Green - fairly certain
        (1.5, "#ca8a04"),  # Amber - some uncertainty
        (2.5, "#ef4444"),  # Red - uncertain
    )
    # Anything under no bound (including NaN) is "very uncertain".
    return next(
        (color for ceiling, color in bands if entropy < ceiling),
        "#a855f7",  # Purple - very uncertain
    )
def build_html_output(tokens: list[TokenData], mode: str = "probability") -> str:
    """
    Render a token sequence as an HTML fragment with per-token coloring.

    Each token becomes a ``<span class="token-span">`` carrying its
    probability, entropy and JSON-encoded alternatives as ``data-*``
    attributes. A token whose text contains a newline is emitted as escaped
    text with ``<br>`` line breaks and no span.

    Args:
        tokens: List of TokenData objects
        mode: "entropy" colors by entropy; any other value colors by probability

    Returns:
        HTML string: a ``<style>`` tag followed by the token container ``<div>``
    """
    # Font stack with emoji support
    font_family = "'Cascadia Code', 'Fira Code', Consolas, monospace, 'Apple Color Emoji', 'Segoe UI Emoji', 'Noto Color Emoji'"
    # CSS for hover underline effect
    style_tag = '<style>.token-span:hover { text-decoration: underline !important; }</style>'

    if not tokens:
        # Nothing to render yet: emit an empty container.
        return f'{style_tag}<div class="token-container" style="font-family: {font_family}; line-height: 1.8; padding: 10px;"></div>'

    def render(token_data) -> str:
        """Return the HTML for one token."""
        escaped = html.escape(token_data.token)
        if "\n" in escaped:
            # Line breaks are emitted bare, without a colored span.
            return escaped.replace("\n", "<br>")

        if mode == "entropy":
            color = entropy_to_color(token_data.entropy)
        else:
            color = probability_to_color(token_data.probability)

        # The JSON is escaped so it can sit inside a double-quoted attribute.
        alternatives_json = html.escape(json.dumps(token_data.alternatives))
        style_attr = f'color: {color}; text-decoration-color: {color}; cursor: pointer;'
        return (
            f'<span class="token-span" style="{style_attr}" '
            f'data-prob="{token_data.probability}" '
            f'data-entropy="{token_data.entropy}" '
            f'data-alternatives="{alternatives_json}">{escaped}</span>'
        )

    html_content = "".join(render(token_data) for token_data in tokens)
    return f'{style_tag}<div class="token-container" style="font-family: {font_family}; line-height: 1.6; padding: 10px; white-space: pre-wrap;">{html_content}</div>'
@gpu_decorator(duration=120)
def transcribe_full(image: Image.Image, model_name: str = None) -> list[TokenData]:
    """
    Run the whole OCR pass and return every generated token at once.

    The function is wrapped with ``gpu_decorator(duration=120)``, which
    resolves to ``spaces.GPU`` when the ``spaces`` package is importable and
    to a no-op otherwise. The token generator is fully drained here, so the
    caller gets a plain list rather than a live generator.

    Args:
        image: PIL Image to process
        model_name: Which model to use for inference; passed through to
            ``generate_with_logprobs`` unchanged

    Returns:
        List of the TokenData items yielded by ``generate_with_logprobs``
    """
    token_stream = generate_with_logprobs(image, model_name=model_name)
    return list(token_stream)
def transcribe_streaming(image: Image.Image, model_name: str = None) -> Generator[tuple[str, str], None, None]:
    """
    Drive the two result views: placeholder, spinner, then growing output.

    Steps:
    1. If no image was supplied, yield an "upload an image" notice and stop.
    2. Yield a spinner for both views.
    3. Obtain all tokens from a single ``transcribe_full`` call.
    4. Re-render both views after each additional token and yield them.

    Inference runs in one ``transcribe_full`` call and rendering happens
    afterwards, outside that call.

    Args:
        image: PIL Image to process, or None when nothing was uploaded
        model_name: Which model to use for inference

    Yields:
        Tuple of (probability_html, entropy_html)
    """
    if image is None:
        notice = '<div style="color: #666; padding: 10px;">Please upload an image.</div>'
        yield notice, notice
        return

    # Spinner shown in both views while inference runs.
    spinner = (
        '<div style="color: #60a5fa; padding: 10px; display: flex; align-items: center; gap: 10px;">\n'
        '<div style="width: 20px; height: 20px; border: 2px solid #60a5fa; border-top-color: transparent; border-radius: 50%; animation: spin 1s linear infinite;"></div>\n'
        '<style>@keyframes spin { to { transform: rotate(360deg); } }</style>\n'
        f'Processing image with {model_name or DEFAULT_MODEL}...\n'
        '</div>'
    )
    yield spinner, spinner

    # The call below only starts once the consumer resumes past the spinner.
    shown: list[TokenData] = []
    for token in transcribe_full(image, model_name=model_name):
        shown.append(token)
        yield (
            build_html_output(shown, mode="probability"),
            build_html_output(shown, mode="entropy"),
        )
# JavaScript for token alternatives panel (loaded via launch js parameter).
# A document-level click handler looks for the nearest element carrying a
# data-alternatives attribute and fills #alternatives-panel with the selected
# token and up to 10 alternatives.
# Token text comes from the OCR'd document, so every piece of it is passed
# through escapeHtml() before being concatenated into the innerHTML string;
# otherwise text such as "<img onerror=...>" would be parsed as markup.
TOKEN_ALTERNATIVES_JS = """
(function() {
    // Escape text so it is safe to concatenate into an innerHTML string.
    function escapeHtml(text) {
        return String(text)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }

    document.addEventListener('click', function(e) {
        var token = e.target.closest('[data-alternatives]');
        if (!token || !token.dataset.alternatives) return;
        var panel = document.getElementById('alternatives-panel');
        if (!panel) return;

        var prob = parseFloat(token.dataset.prob) || 0;
        // A malformed attribute should degrade to "no alternatives", not throw.
        var alts;
        try {
            alts = JSON.parse(token.dataset.alternatives);
        } catch (err) {
            alts = [];
        }
        if (!Array.isArray(alts)) alts = [];
        var tokenText = token.textContent;

        // Build panel content
        var html = '<div style="font-weight:600;margin-bottom:12px;padding-bottom:8px;border-bottom:1px solid #374151;">' +
            'Selected: "<span style="color:#60a5fa">' + escapeHtml(tokenText) + '</span>" (' + (prob * 100).toFixed(2) + '%)' +
            '</div>';

        if (alts.length === 0) {
            html += '<div style="color:#9ca3af;font-style:italic">No alternatives available</div>';
        } else {
            var shown = Math.min(alts.length, 10);
            html += '<div style="font-size:12px;color:#9ca3af;margin-bottom:8px;">Top ' + shown + ' alternatives:</div>';
            for (var i = 0; i < shown; i++) {
                var alt = alts[i];
                var altProb = (alt.probability * 100).toFixed(2);
                var barWidth = Math.max(alt.probability * 100, 1);
                html += '<div style="display:flex;align-items:center;margin:6px 0;">' +
                    '<span style="width:80px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;font-family:monospace;">' +
                    escapeHtml(alt.token) + '</span>' +
                    '<span style="width:55px;text-align:right;color:#9ca3af;font-size:12px;margin-right:10px;">' +
                    altProb + '%</span>' +
                    '<div style="flex:1;height:10px;background:#374151;border-radius:5px;overflow:hidden;">' +
                    '<div style="width:' + barWidth + '%;height:100%;background:#60a5fa;border-radius:5px;"></div>' +
                    '</div></div>';
            }
        }

        panel.innerHTML = html;
    });
})();
"""
# Starting markup for the alternatives panel: a dark card with
# id="alternatives-panel" holding a hint line. The click handler in
# TOKEN_ALTERNATIVES_JS looks the card up by that id and replaces its content.
ALTERNATIVES_PANEL_INITIAL = "\n".join(
    (
        "",  # leading newline
        '<div id="alternatives-panel" style="',
        "padding: 16px;",
        "background: #1f2937;",
        "border-radius: 8px;",
        "color: #e5e7eb;",
        "font-family: system-ui, -apple-system, sans-serif;",
        "font-size: 14px;",
        "min-height: 100px;",
        '">',
        '<div style="color: #9ca3af; font-style: italic;">',
        "Click on any token above to see alternative predictions.",
        "</div>",
        "</div>",
        "",  # trailing newline
    )
)
# Build Gradio interface.
# Left column: model choice, image upload and the Transcribe button.
# Right column: Probability / Entropy tabs plus the token alternatives panel.
# Placeholder shown in both result tabs before the first transcription.
_IDLE_OUTPUT_HTML = '<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>'

with gr.Blocks(title="OCR Confidence Visualization") as demo:
    gr.Markdown("# OCR Confidence Visualization")
    gr.Markdown("Upload a document image to extract text with token streaming.")

    with gr.Row():
        # Inputs
        with gr.Column(scale=1):
            model_selector = gr.Radio(
                label="Model",
                choices=list(AVAILABLE_MODELS.keys()),
                value=DEFAULT_MODEL,
            )
            image_input = gr.Image(type="pil", label="Upload Document")
            submit_btn = gr.Button("Transcribe", variant="primary")

        # Outputs
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Probability"):
                    output_html_prob = gr.HTML(value=_IDLE_OUTPUT_HTML)
                with gr.TabItem("Entropy"):
                    output_html_entropy = gr.HTML(value=_IDLE_OUTPUT_HTML)
            gr.Markdown("### Token Alternatives")
            alternatives_html = gr.HTML(value=ALTERNATIVES_PANEL_INITIAL)

    # transcribe_streaming yields (probability_html, entropy_html) tuples,
    # matching the order of the two output components.
    submit_btn.click(
        fn=transcribe_streaming,
        inputs=[image_input, model_selector],
        outputs=[output_html_prob, output_html_entropy],
    )
if __name__ == "__main__":
    # SPACES_AVAILABLE only records whether the `spaces` package imported;
    # it is used here as the signal for running on ZeroGPU.
    if SPACES_AVAILABLE:
        # Skip preloading; the message below says the model loads on the
        # first inference request.
        print("ZeroGPU detected - model will load on first inference request")
    else:
        # Local development: load the model up front.
        print("Preloading model (local development)...")
        load_model()

    print("Starting Gradio server...")
    # The click-handler script for the alternatives panel is passed at launch.
    demo.launch(server_port=7860, js=TOKEN_ALTERNATIVES_JS)