Spaces:

ryandt
/

ocr-entropy

Sleeping

App Files Files Community

ocr-entropy / app.py

ryandt

Update app.py

c6f6682 verified 12 days ago

raw

history blame contribute delete

11.8 kB

	"""
	OCR Confidence Visualization - Gradio Application.

	Upload a document image to extract text with confidence visualization.

	Supports deployment to HuggingFace Spaces with ZeroGPU via @spaces.GPU decorator.
	The decorator is effect-free in non-ZeroGPU environments for local development.
	"""

	import html
	import json
	from typing import Generator

	import gradio as gr
	from PIL import Image

	# Import spaces for ZeroGPU support (effect-free outside HuggingFace Spaces)
	try:
	import spaces
	SPACES_AVAILABLE = True
	except ImportError:
	SPACES_AVAILABLE = False

	from model import generate_with_logprobs, load_model, TokenData, AVAILABLE_MODELS, DEFAULT_MODEL


	def gpu_decorator(duration: int = 120):
	    """
	    Build the decorator used to mark GPU-bound functions.

	    When the ``spaces`` package was importable, this is ``spaces.GPU``
	    configured with the requested duration. Otherwise it is an identity
	    decorator, so decorated functions run unchanged during local development.

	    Args:
	        duration: Value forwarded to ``spaces.GPU(duration=...)``.

	    Returns:
	        A decorator that accepts a function and returns a callable.
	    """
	    if not SPACES_AVAILABLE:
	        # No ZeroGPU support available: hand the function back untouched.
	        def passthrough(fn):
	            return fn

	        return passthrough
	    return spaces.GPU(duration=duration)


	def probability_to_color(prob: float) -> str:
	    """
	    Pick the display color for a token from its confidence probability.

	    The probability is compared against descending lower bounds using a
	    strict ``>``; the first bound it exceeds decides the color.

	    Args:
	        prob: Confidence probability (0.0 to 1.0)

	    Returns:
	        Hex color string
	    """
	    # (exclusive lower bound, color), ordered from most to least confident.
	    bands = (
	        (0.99, "#3b82f6"),  # blue: very high confidence
	        (0.95, "#16a34a"),  # dark green: high confidence
	        (0.85, "#65a30d"),  # green: good confidence
	        (0.70, "#ca8a04"),  # amber: moderate confidence
	        (0.50, "#ef4444"),  # red: low confidence
	    )
	    for lower_bound, color in bands:
	        if prob > lower_bound:
	            return color
	    # Anything at or below 0.50 (including NaN, which fails every comparison).
	    return "#a855f7"  # purple: very low confidence


	def entropy_to_color(entropy: float) -> str:
	    """
	    Pick the display color for a token from its entropy.

	    Lower entropy means a more certain prediction. The value is compared
	    against ascending upper bounds using a strict ``<``; the first bound it
	    falls under decides the color.

	    Args:
	        entropy: Shannon entropy in bits (0.0 = certain)

	    Returns:
	        Hex color string
	    """
	    # (exclusive upper bound, color), ordered from most to least certain.
	    bands = (
	        (0.1, "#3b82f6"),  # blue: very certain
	        (0.3, "#16a34a"),  # dark green: certain
	        (0.7, "#65a30d"),  # green: fairly certain
	        (1.5, "#ca8a04"),  # amber: some uncertainty
	        (2.5, "#ef4444"),  # red: uncertain
	    )
	    # Fallback covers entropy >= 2.5 (and NaN, which fails every comparison).
	    return next(
	        (color for upper_bound, color in bands if entropy < upper_bound),
	        "#a855f7",  # purple: very uncertain
	    )


	def build_html_output(tokens: list[TokenData], mode: str = "probability") -> str:
	    """
	    Render a token list as one HTML fragment with per-token coloring.

	    Each token without a newline becomes a ``<span class="token-span">``
	    carrying its probability, entropy and JSON-encoded alternatives as
	    ``data-*`` attributes. A token whose text contains a newline is emitted
	    as plain escaped text with ``<br>`` in place of each newline, without a
	    span or color.

	    Args:
	        tokens: List of TokenData objects
	        mode: "entropy" colors by entropy; any other value colors by probability

	    Returns:
	        HTML string: a ``<style>`` tag followed by the token container ``<div>``
	    """
	    # Monospace stack with emoji fallbacks.
	    fonts = "'Cascadia Code', 'Fira Code', Consolas, monospace, 'Apple Color Emoji', 'Segoe UI Emoji', 'Noto Color Emoji'"
	    # Tokens are underlined only while hovered.
	    hover_css = '<style>.token-span:hover { text-decoration: underline !important; }</style>'

	    if not tokens:
	        return (
	            f'{hover_css}<div class="token-container" '
	            f'style="font-family: {fonts}; line-height: 1.8; padding: 10px;"></div>'
	        )

	    color_by_entropy = mode == "entropy"

	    def render(tok) -> str:
	        """Return the HTML for a single token."""
	        escaped = html.escape(tok.token)
	        if "\n" in escaped:
	            # Newline-bearing tokens are emitted bare, with <br> line breaks.
	            return escaped.replace("\n", "<br>")

	        if color_by_entropy:
	            shade = entropy_to_color(tok.entropy)
	        else:
	            shade = probability_to_color(tok.probability)

	        # Escaped so the JSON can sit inside a double-quoted attribute.
	        alt_attr = html.escape(json.dumps(tok.alternatives))
	        pieces = [
	            f'<span class="token-span" style="color: {shade}; ',
	            f'text-decoration-color: {shade}; cursor: pointer;" ',
	            f'data-prob="{tok.probability}" ',
	            f'data-entropy="{tok.entropy}" ',
	            f'data-alternatives="{alt_attr}">',
	            f'{escaped}</span>',
	        ]
	        return "".join(pieces)

	    body = "".join(render(tok) for tok in tokens)
	    return (
	        f'{hover_css}<div class="token-container" '
	        f'style="font-family: {fonts}; line-height: 1.6; padding: 10px; '
	        f'white-space: pre-wrap;">{body}</div>'
	    )


	@gpu_decorator(duration=120)
	def transcribe_full(image: Image.Image, model_name: str = None) -> list[TokenData]:
	    """
	    Run OCR inference to completion and return every generated token.

	    The token generator is fully consumed inside this function, so all
	    inference work happens within the single ``gpu_decorator``-wrapped call.

	    Args:
	        image: PIL Image to process
	        model_name: Which model to use for inference; forwarded unchanged
	            to ``generate_with_logprobs``

	    Returns:
	        List of the TokenData items yielded by ``generate_with_logprobs``
	    """
	    token_stream = generate_with_logprobs(image, model_name=model_name)
	    # Drain the generator here rather than handing a lazy iterator to the caller.
	    return [*token_stream]


	def transcribe_streaming(image: Image.Image, model_name: str = None) -> Generator[tuple[str, str], None, None]:
	    """
	    Stream OCR results as progressively growing HTML for both views.

	    Steps:
	    1. If no image was supplied, yield a prompt to upload one and stop.
	    2. Yield a "Processing..." indicator.
	    3. Run the whole inference in one ``transcribe_full`` call.
	    4. If no tokens came back, yield a "no text" message and stop, so the
	       processing indicator is never left as the final output.
	    5. Otherwise re-render the HTML after each token and yield it.

	    Inference is kept in a single decorated call because, per the original
	    design notes, HuggingFace ZeroGPU allocates GPU resources per decorated
	    function call rather than for streaming.

	    Args:
	        image: PIL Image to process, or None when nothing was uploaded
	        model_name: Which model to use for inference; DEFAULT_MODEL is shown
	            in the progress message when this is falsy

	    Yields:
	        Tuple of (probability_html, entropy_html)
	    """
	    if image is None:
	        empty = '<div style="color: #666; padding: 10px;">Please upload an image.</div>'
	        yield empty, empty
	        return

	    # Show processing indicator during GPU inference
	    loading = f'''<div style="color: #60a5fa; padding: 10px; display: flex; align-items: center; gap: 10px;">
	<div style="width: 20px; height: 20px; border: 2px solid #60a5fa; border-top-color: transparent; border-radius: 50%; animation: spin 1s linear infinite;"></div>
	<style>@keyframes spin {{ to {{ transform: rotate(360deg); }} }}</style>
	Processing image with {model_name or DEFAULT_MODEL}...
	</div>'''
	    yield loading, loading

	    # Run full inference (GPU allocated here on ZeroGPU)
	    tokens = transcribe_full(image, model_name=model_name)

	    if not tokens:
	        # Without this, the loop below yields nothing and the spinner above
	        # would remain on screen as the last value sent to the UI.
	        no_text = '<div style="color: #666; padding: 10px;">No text was detected in the image.</div>'
	        yield no_text, no_text
	        return

	    # Stream HTML rendering (no GPU needed): each step renders all tokens so far.
	    accumulated: list[TokenData] = []
	    for token in tokens:
	        accumulated.append(token)
	        prob_html = build_html_output(accumulated, mode="probability")
	        entropy_html = build_html_output(accumulated, mode="entropy")
	        yield prob_html, entropy_html


	# JavaScript for the token alternatives panel (passed to demo.launch via `js`).
	#
	# A single document-level click listener looks for the nearest ancestor that
	# carries a `data-alternatives` attribute (the token spans produced by
	# build_html_output), reads its probability and JSON-encoded alternatives,
	# and rewrites the contents of the element with id "alternatives-panel".
	#
	# All token text comes from OCR output and is therefore untrusted: it is
	# HTML-escaped by escapeHtml() before being concatenated into innerHTML.
	# The string contains no backslashes, so no Python escape processing applies.
	TOKEN_ALTERNATIVES_JS = """
	(function() {
	    // Escape text before it is concatenated into an innerHTML string.
	    function escapeHtml(text) {
	        return String(text)
	            .replace(/&/g, '&amp;')
	            .replace(/</g, '&lt;')
	            .replace(/>/g, '&gt;')
	            .replace(/"/g, '&quot;');
	    }

	    document.addEventListener('click', function(e) {
	        var token = e.target.closest('[data-alternatives]');
	        if (!token || !token.dataset.alternatives) return;

	        var panel = document.getElementById('alternatives-panel');
	        if (!panel) return;

	        var prob = parseFloat(token.dataset.prob) || 0;
	        var alts = JSON.parse(token.dataset.alternatives);
	        var tokenText = token.textContent;
	        var shown = Math.min(alts.length, 10);

	        // Build panel content
	        var html = '<div style="font-weight:600;margin-bottom:12px;padding-bottom:8px;border-bottom:1px solid #374151;">' +
	            'Selected: "<span style="color:#60a5fa">' + escapeHtml(tokenText) + '</span>" (' + (prob * 100).toFixed(2) + '%)' +
	            '</div>';

	        if (alts.length === 0) {
	            html += '<div style="color:#9ca3af;font-style:italic">No alternatives available</div>';
	        } else {
	            html += '<div style="font-size:12px;color:#9ca3af;margin-bottom:8px;">Top ' + shown + ' alternatives:</div>';
	            for (var i = 0; i < shown; i++) {
	                var alt = alts[i];
	                var altProb = (alt.probability * 100).toFixed(2);
	                // Keep a sliver visible even for near-zero probabilities.
	                var barWidth = Math.max(alt.probability * 100, 1);
	                html += '<div style="display:flex;align-items:center;margin:6px 0;">' +
	                    '<span style="width:80px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;font-family:monospace;">' +
	                    escapeHtml(alt.token) + '</span>' +
	                    '<span style="width:55px;text-align:right;color:#9ca3af;font-size:12px;margin-right:10px;">' +
	                    altProb + '%</span>' +
	                    '<div style="flex:1;height:10px;background:#374151;border-radius:5px;overflow:hidden;">' +
	                    '<div style="width:' + barWidth + '%;height:100%;background:#60a5fa;border-radius:5px;"></div>' +
	                    '</div></div>';
	            }
	        }

	        panel.innerHTML = html;
	    });
	})();
	"""

	# Initial HTML for the alternatives panel, shown before any token is clicked.
	# The element id "alternatives-panel" is the one TOKEN_ALTERNATIVES_JS looks
	# up with getElementById; the click handler replaces this element's contents
	# (the italic hint below) with the selected token's alternatives.
	ALTERNATIVES_PANEL_INITIAL = '''
	<div id="alternatives-panel" style="
	padding: 16px;
	background: #1f2937;
	border-radius: 8px;
	color: #e5e7eb;
	font-family: system-ui, -apple-system, sans-serif;
	font-size: 14px;
	min-height: 100px;
	">
	<div style="color: #9ca3af; font-style: italic;">
	Click on any token above to see alternative predictions.
	</div>
	</div>
	'''

	# Build the Gradio interface and bind it to the module-level name `demo`.
	# Inputs: a model picker, an image upload and a "Transcribe" button.
	# Outputs: two HTML views (Probability / Entropy) plus the static
	# token-alternatives panel.
	with gr.Blocks(title="OCR Confidence Visualization") as demo:
	gr.Markdown("# OCR Confidence Visualization")
	gr.Markdown("Upload a document image to extract text with token streaming.")

	with gr.Row():
	with gr.Column(scale=1):
	# Choices are the keys of AVAILABLE_MODELS; DEFAULT_MODEL is preselected.
	model_selector = gr.Radio(
	choices=list(AVAILABLE_MODELS.keys()),
	value=DEFAULT_MODEL,
	label="Model",
	)
	# type="pil" matches the PIL Image parameter of transcribe_streaming.
	image_input = gr.Image(type="pil", label="Upload Document")
	submit_btn = gr.Button("Transcribe", variant="primary")

	with gr.Column(scale=2):
	with gr.Tabs():
	# Both views start with the same placeholder text and are the two
	# outputs wired to the button click below.
	with gr.TabItem("Probability"):
	output_html_prob = gr.HTML(
	value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
	)
	with gr.TabItem("Entropy"):
	output_html_entropy = gr.HTML(
	value='<div style="color: #666; padding: 10px;">Upload an image and click Transcribe to start.</div>',
	)
	gr.Markdown("### Token Alternatives")
	# Not an output of any Python callback: its contents are rewritten in the
	# browser by TOKEN_ALTERNATIVES_JS when a token span is clicked.
	alternatives_html = gr.HTML(
	value=ALTERNATIVES_PANEL_INITIAL,
	)

	# transcribe_streaming is a generator; each (probability_html, entropy_html)
	# pair it yields maps, in order, onto the two outputs listed here.
	submit_btn.click(
	fn=transcribe_streaming,
	inputs=[image_input, model_selector],
	outputs=[output_html_prob, output_html_entropy],
	)


	if __name__ == "__main__":
	    # SPACES_AVAILABLE is True when the `spaces` package could be imported.
	    # In that case the model is not preloaded; per the original notes, it is
	    # loaded on the first inference request, when the GPU is allocated by
	    # the @spaces.GPU decorator. Otherwise (local development) it is loaded
	    # up front, before the server starts.
	    if SPACES_AVAILABLE:
	        print("ZeroGPU detected - model will load on first inference request")
	    else:
	        print("Preloading model (local development)...")
	        load_model()

	    print("Starting Gradio server...")
	    # The click-handler script for the alternatives panel is supplied at launch.
	    demo.launch(js=TOKEN_ALTERNATIVES_JS, server_port=7860)