Spaces:

xeeshan404
/

agentic-image2word

Sleeping

App Files Files Community

agentic-image2word / app.py

xeeshan404

UI updates

471cec2 verified about 1 month ago

raw

history blame contribute delete

22.6 kB

	"""
	Agentic Image-to-Word Converter — Gradio 6 Web Application

	A modern agentic OCR system that converts images to formatted Word documents
	using docTR, LangGraph, and optional LLM intelligence.
	"""

	import html
	import logging
	import os
	import sys
	from datetime import datetime

	import gradio as gr

	# Ensure project root is in path
	sys.path.insert(0, os.path.dirname(__file__))

	from agents.graph import run_agent
	from memory.store import MemoryStore

	# Configure the root logger at INFO level when this module is imported.
	logging.basicConfig(level=logging.INFO)
	# Module-scoped logger, named after this module.
	logger = logging.getLogger(__name__)

	# Single MemoryStore instance shared by the module; get_history_html() reads
	# its stats and history.
	memory = MemoryStore()

	# ─── Custom CSS ────────────────────────────────────────────────────────────────

	# Stylesheet handed to ``app.launch(css=CUSTOM_CSS)`` in the entry point.
	# ``.header-banner`` styles the banner emitted by build_ui(), and
	# ``.confidence-bar`` / ``.confidence-fill`` style the bar produced by
	# _build_status_html(). The other classes are not referenced elsewhere in this
	# file. NOTE(review): confirm whether another module applies them (for
	# example the generated preview HTML) or whether they are unused.
	CUSTOM_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

	* { font-family: 'Inter', sans-serif !important; }

	.gradio-container {
	max-width: 1200px !important;
	margin: auto !important;
	}

	/* Header banner */
	.header-banner {
	background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #0f172a 100%);
	border: 1px solid rgba(99, 102, 241, 0.3);
	border-radius: 16px;
	padding: 24px 32px;
	margin-bottom: 16px;
	position: relative;
	overflow: hidden;
	}
	.header-banner::before {
	content: '';
	position: absolute;
	top: -50%;
	left: -50%;
	width: 200%;
	height: 200%;
	background: radial-gradient(circle at 30% 50%, rgba(99, 102, 241, 0.08) 0%, transparent 50%);
	animation: pulse-glow 4s ease-in-out infinite;
	}
	@keyframes pulse-glow {
	0%, 100% { opacity: 0.5; }
	50% { opacity: 1; }
	}
	.header-banner h1 {
	color: #e2e8f0 !important;
	font-size: 1.8em !important;
	font-weight: 700 !important;
	margin: 0 !important;
	position: relative;
	z-index: 1;
	}
	.header-banner p {
	color: #94a3b8 !important;
	font-size: 0.95em !important;
	margin: 4px 0 0 !important;
	position: relative;
	z-index: 1;
	}

	/* Agent log styling */
	.agent-log {
	background: #0f172a;
	border: 1px solid #1e293b;
	border-radius: 12px;
	padding: 16px;
	font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
	font-size: 0.85em;
	line-height: 1.6;
	max-height: 400px;
	overflow-y: auto;
	}

	/* Status badges */
	.status-excellent { color: #4ade80; font-weight: 600; }
	.status-good { color: #60a5fa; font-weight: 600; }
	.status-fair { color: #fbbf24; font-weight: 600; }
	.status-poor { color: #f87171; font-weight: 600; }

	/* Confidence bar */
	.confidence-bar {
	height: 8px;
	border-radius: 4px;
	background: #1e293b;
	overflow: hidden;
	margin: 8px 0;
	}
	.confidence-fill {
	height: 100%;
	border-radius: 4px;
	transition: width 0.5s ease;
	}

	/* Card styling */
	.settings-card {
	background: linear-gradient(135deg, #1e293b, #0f172a);
	border: 1px solid rgba(99, 102, 241, 0.2);
	border-radius: 12px;
	padding: 16px;
	}

	/* Preview area */
	.preview-area {
	background: #ffffff;
	color: #1e293b;
	border-radius: 12px;
	padding: 20px;
	min-height: 200px;
	border: 1px solid #e2e8f0;
	}

	/* Tab styling override */
	.tab-nav button {
	font-weight: 500 !important;
	font-size: 0.9em !important;
	}
	"""


	# ─── Processing Function ──────────────────────────────────────────────────────

	def _select_llm_backend(llm_provider, api_key):
	    """Translate the UI provider label and credential field into ``(provider, key)``.

	    Gemini is only selected when a key was supplied; Ollama falls back to the
	    default local endpoint when the field is empty; everything else maps to
	    ``"none"``.
	    """
	    key = api_key.strip() if api_key else ""
	    if llm_provider == "Google Gemini" and key:
	        return "gemini", key
	    if llm_provider == "Ollama (Local)":
	        # An empty field means "use the default local Ollama URL".
	        return "ollama", key or "http://localhost:11434"
	    return "none", key


	def process_image(image_path, api_key, llm_provider, progress=gr.Progress()):
	    """Run the agent on an uploaded image and return values for the five UI outputs.

	    Args:
	        image_path: Path of the uploaded image, or ``None`` when nothing has
	            been uploaded.
	        api_key: Contents of the "API Key / Ollama URL" field; may be empty.
	        llm_provider: Label chosen in the provider dropdown.
	        progress: Gradio progress tracker used to report stage descriptions.

	    Returns:
	        A 5-tuple ``(docx_path, preview_html, raw_text, agent_log,
	        status_html)``. On any failure ``docx_path`` is ``None`` and
	        ``preview_html`` carries the (HTML-escaped) error message.
	    """
	    if image_path is None:
	        return (
	            None,
	            '<div style="color: #f87171; padding: 20px;"> Please upload an image first.</div>',
	            "",
	            "",
	            "",
	        )

	    progress(0.1, desc="Agent: Perceiving image...")

	    provider, key = _select_llm_backend(llm_provider, api_key)

	    try:
	        progress(0.2, desc="Agent: Analyzing image properties...")

	        # Run the full agentic workflow
	        result = run_agent(image_path, api_key=key, llm_provider=provider)

	        progress(0.9, desc="Agent: Finalizing...")

	        agent_log = _format_agent_log(result.get("processing_log", []))

	        error = result.get("error")
	        if error:
	            # The message is interpolated into markup, so escape it first.
	            return (
	                None,
	                f'<div style="color: #f87171; padding: 20px;"> {html.escape(str(error))}</div>',
	                "",
	                agent_log,
	                "",
	            )

	        # A missing/empty path must reach the File component as None rather
	        # than as an empty string.
	        docx_path = result.get("docx_path") or None
	        preview_html = result.get("preview_html", "")
	        raw_text = result.get("raw_text", "")
	        confidence = result.get("confidence_score", 0)
	        quality = result.get("quality_assessment", "unknown")

	        status_html = _build_status_html(confidence, quality, result)

	        progress(1.0, desc="✅ Complete!")

	        return (docx_path, preview_html, raw_text, agent_log, status_html)

	    except Exception as e:
	        # UI boundary: log the traceback and show the failure instead of crashing.
	        logger.exception("Processing error: %s", e)
	        return (
	            None,
	            f'<div style="color: #f87171; padding: 20px;">❌ Error: {html.escape(str(e))}</div>',
	            "",
	            f"Error: {e}",
	            "",
	        )


	def _build_status_html(confidence, quality, result):
	"""Build status display HTML."""
	conf_pct = int(confidence * 100)
	color = (
	"#4ade80" if confidence >= 0.8
	else "#60a5fa" if confidence >= 0.7
	else "#fbbf24" if confidence >= 0.5
	else "#f87171"
	)
	engine = result.get("ocr_result", {}).get("engine_used", "Unknown")
	num_paras = len(result.get("layout_analysis", {}).get("paragraphs", []))
	props = result.get("image_properties", {})

	html = f"""
	<div style="padding: 16px; font-family: Inter, sans-serif;">
	<div style="display: flex; align-items: center; gap: 12px; margin-bottom: 12px;">
	<span style="font-size: 1.5em;">{'Good' if confidence >= 0.7 else 'Bad' if confidence >= 0.5 else 'Failed'}</span>
	<div>
	<div style="font-weight: 600; font-size: 1.1em; color: {color};">
	{quality.upper()} Quality — {conf_pct}% Confidence
	</div>
	<div style="color: #94a3b8; font-size: 0.85em;">
	OCR Engine: {engine} \| Paragraphs: {num_paras}
	</div>
	</div>
	</div>
	<div class="confidence-bar">
	<div class="confidence-fill" style="width: {conf_pct}%; background: {color};"></div>
	</div>
	<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 8px; margin-top: 12px; font-size: 0.85em; color: #94a3b8;">
	<div>Resolution: {props.get('width', '?')}×{props.get('height', '?')}</div>
	<div>DPI: {props.get('resolution_dpi', '?')}</div>
	<div>Contrast: {props.get('contrast_score', 0):.0%}</div>
	<div>Noise: {props.get('noise_level', 0):.0%}</div>
	</div>
	{'<div style="margin-top: 12px; padding: 8px; background: rgba(251, 191, 36, 0.1); border-radius: 8px; color: #fbbf24; font-size: 0.85em;">⚠️ Low confidence — consider reviewing the extracted text before downloading.</div>' if result.get("needs_human_review") else ''}
	</div>
	"""
	return html


	def _format_agent_log(log):
	"""Format agent processing log as readable text."""
	if not log:
	return "No processing log available."

	lines = []
	step_icons = {
	"perceive": "",
	"analyze": "",
	"decide": "",
	"act": "",
	"learn": "",
	}

	for entry in log:
	step = entry.get("step", "")
	icon = step_icons.get(step, "▸")
	action = entry.get("action", "")
	detail = entry.get("detail", "")
	ts = entry.get("timestamp", "")

	time_str = ""
	if ts:
	try:
	dt = datetime.fromisoformat(ts)
	time_str = f"[{dt.strftime('%H:%M:%S')}] "
	except Exception:
	pass

	line = f"{icon} {time_str}{action}"
	if detail:
	line += f"\n └─ {detail}"
	lines.append(line)

	return "\n".join(lines)


	def get_history_html():
	    """Render aggregate stats and recent records from ``memory`` as HTML.

	    Reads ``memory.get_stats()`` and ``memory.get_history(10)``. When
	    ``total_processed`` is 0 a placeholder message is returned; otherwise the
	    result contains three summary tiles followed by up to five of the last
	    returned records, in reverse order.

	    Returns:
	        The markup as a single string; per-record values are HTML-escaped.
	    """
	    stats = memory.get_stats()
	    history = memory.get_history(10)

	    if stats["total_processed"] == 0:
	        return '<div style="padding: 20px; color: #94a3b8; text-align: center;">No processing history yet. Convert an image to start building memory.</div>'

	    # Collect fragments and join once instead of growing a string with +=.
	    parts = [f"""
	<div style="padding: 16px;">
	<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; margin-bottom: 16px;">
	<div style="background: rgba(99, 102, 241, 0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em; font-weight: 700; color: #818cf8;">{stats['total_processed']}</div>
	<div style="color: #94a3b8; font-size: 0.85em;">Documents Processed</div>
	</div>
	<div style="background: rgba(74, 222, 128, 0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em; font-weight: 700; color: #4ade80;">{stats['avg_confidence']:.0%}</div>
	<div style="color: #94a3b8; font-size: 0.85em;">Avg Confidence</div>
	</div>
	<div style="background: rgba(96, 165, 250, 0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em; font-weight: 700; color: #60a5fa;">{len(stats.get('quality_distribution', {}))}</div>
	<div style="color: #94a3b8; font-size: 0.85em;">Quality Levels</div>
	</div>
	</div>
	"""]

	    if history:
	        parts.append('<div style="font-weight: 600; margin-bottom: 8px; color: #e2e8f0;">Recent Activity</div>')
	        for record in reversed(history[-5:]):
	            ts = html.escape(str(record.get("timestamp", record.get("saved_at", ""))))
	            # A stored None would break the percent format below.
	            conf = record.get("confidence_score") or 0
	            engine = html.escape(str(record.get("ocr_engine", "?")))
	            n_paras = html.escape(str(record.get("num_paragraphs", 0)))

	            parts.append(f"""
	<div style="border: 1px solid #1e293b; border-radius: 8px; padding: 10px; margin-bottom: 6px; font-size: 0.85em;">
	<div style="display: flex; justify-content: space-between;">
	<span style="color: #e2e8f0;">{engine} — {n_paras} paragraphs</span>
	<span style="color: #94a3b8;">{conf:.0%} confidence</span>
	</div>
	<div style="color: #64748b; font-size: 0.8em;">{ts}</div>
	</div>
	""")

	    parts.append('</div>')
	    return "".join(parts)


	# ─── Build Gradio UI ──────────────────────────────────────────────────────────

	def build_ui():
	    """Build the Gradio 6 application.

	    Lays out three tabs: "Convert" (image upload, AI settings, convert button,
	    status panel, and preview / raw text / agent log outputs plus the file
	    download), "History & Memory" (rendered by get_history_html() with a
	    refresh button) and a static "About" page. The convert button is wired to
	    process_image().

	    Returns:
	        The ``gr.Blocks`` instance. Theme and CSS are not set here; the entry
	        point passes them to ``launch()``.
	    """

	    with gr.Blocks(
	        title="Agentic Image2Word — AI-Powered OCR Converter",
	    ) as app:

	        # Header
	        gr.HTML("""
	<div class="header-banner">
	<h1>RayXar - Image2Word Converter</h1>
	<p>AI-powered document conversion with adaptive OCR, intelligent formatting, and learning memory</p>
	</div>
	""")

	        with gr.Tabs():

	            # ── Tab 1: Converter ──────────────────────────────────────────
	            with gr.TabItem("Convert", id="convert-tab"):
	                with gr.Row(equal_height=False):

	                    # Left Column — Input & Settings
	                    with gr.Column(scale=1):
	                        img_input = gr.Image(
	                            type="filepath",
	                            label="Upload Image",
	                            height=300,
	                            sources=["upload", "clipboard"],
	                        )

	                        with gr.Accordion("⚙️ AI Settings", open=False):
	                            llm_provider = gr.Dropdown(
	                                choices=["None (Rule-Based)", "Google Gemini", "Ollama (Local)"],
	                                value="None (Rule-Based)",
	                                label="Intelligence Provider",
	                                info="Select LLM for enhanced formatting",
	                            )
	                            api_key = gr.Textbox(
	                                label="API Key / Ollama URL",
	                                placeholder="Enter Gemini API key or Ollama URL...",
	                                type="password",
	                                info="Gemini: paste API key | Ollama: http://localhost:11434",
	                            )

	                        btn_convert = gr.Button(
	                            "Convert with Agent",
	                            variant="primary",
	                            size="lg",
	                        )

	                        # Status display
	                        status_html = gr.HTML(
	                            value='<div style="padding: 16px; color: #94a3b8; text-align: center;">Upload an image and click Convert to start.</div>',
	                            label="Agent Status",
	                        )

	                    # Right Column — Output
	                    with gr.Column(scale=1):
	                        with gr.Tabs():
	                            with gr.TabItem("Preview"):
	                                preview_output = gr.HTML(
	                                    value='<div style="padding: 40px; color: #94a3b8; text-align: center; font-style: italic;">Document preview will appear here...</div>',
	                                    label="Document Preview",
	                                )

	                            with gr.TabItem("Raw Text"):
	                                text_output = gr.Textbox(
	                                    label="Extracted Text (Editable)",
	                                    lines=12,
	                                    interactive=True,
	                                    placeholder="Extracted text will appear here. You can edit before downloading...",
	                                    info="Edit text here before generating the final document",
	                                )

	                            with gr.TabItem("Agent Log"):
	                                agent_log = gr.Textbox(
	                                    label="Agent Decision Log",
	                                    lines=12,
	                                    interactive=False,
	                                    placeholder="Agent processing steps will appear here...",
	                                )

	                        file_output = gr.File(
	                            label="📥 Download Word Document",
	                            interactive=False,
	                        )

	                # Wire up conversion; the outputs list must stay in the same
	                # order as the tuple returned by process_image().
	                btn_convert.click(
	                    fn=process_image,
	                    inputs=[img_input, api_key, llm_provider],
	                    outputs=[file_output, preview_output, text_output, agent_log, status_html],
	                )

	            # ── Tab 2: Memory & History ───────────────────────────────────
	            with gr.TabItem("History & Memory", id="history-tab"):
	                gr.HTML("""
	<div style="padding: 12px 0;">
	<h3 style="margin: 0; color: #e2e8f0;">Agent Memory</h3>
	<p style="color: #94a3b8; margin: 4px 0 0;">
	The agent learns from each document it processes, adapting its preprocessing
	and formatting decisions based on past results.
	</p>
	</div>
	""")
	                # Rendered once while the UI is built; the button re-renders it.
	                history_display = gr.HTML(value=get_history_html())
	                btn_refresh = gr.Button("Refresh History", size="sm")
	                btn_refresh.click(fn=get_history_html, outputs=[history_display])

	            # ── Tab 3: About ──────────────────────────────────────────────
	            with gr.TabItem("About", id="about-tab"):
	                gr.HTML("""
	<div style="padding: 20px; line-height: 1.8;">
	<h2 style="color: #e2e8f0;">Agentic Image2Word Converter</h2>
	<p style="color: #94a3b8;">
	This application transforms scanned documents and images into formatted
	Word documents using an <b>agentic AI architecture</b>.
	</p>

	<h3 style="color: #818cf8; margin-top: 20px;">Architecture</h3>
	<div style="display: grid; grid-template-columns: repeat(5, 1fr); gap: 8px; margin: 12px 0;">
	<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em;"></div>
	<div style="font-weight: 600; color: #e2e8f0;">Perceive</div>
	<div style="font-size: 0.75em; color: #94a3b8;">Analyze image</div>
	</div>
	<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em;"></div>
	<div style="font-weight: 600; color: #e2e8f0;">Analyze</div>
	<div style="font-size: 0.75em; color: #94a3b8;">Run OCR</div>
	</div>
	<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em;"></div>
	<div style="font-weight: 600; color: #e2e8f0;">Decide</div>
	<div style="font-size: 0.75em; color: #94a3b8;">Format text</div>
	</div>
	<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em;"></div>
	<div style="font-weight: 600; color: #e2e8f0;">Act</div>
	<div style="font-size: 0.75em; color: #94a3b8;">Generate DOCX</div>
	</div>
	<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
	<div style="font-size: 1.5em;"></div>
	<div style="font-weight: 600; color: #e2e8f0;">Learn</div>
	<div style="font-size: 0.75em; color: #94a3b8;">Save memory</div>
	</div>
	</div>

	<h3 style="color: #818cf8; margin-top: 20px;">Technology Stack</h3>
	<table style="width: 100%; color: #e2e8f0; border-collapse: collapse; margin: 12px 0;">
	<tr style="border-bottom: 1px solid #1e293b;">
	<td style="padding: 8px; font-weight: 600;">OCR Engine</td>
	<td style="padding: 8px; color: #94a3b8;">docTR (Deep Learning) + Tesseract fallback</td>
	</tr>
	<tr style="border-bottom: 1px solid #1e293b;">
	<td style="padding: 8px; font-weight: 600;">Agentic Framework</td>
	<td style="padding: 8px; color: #94a3b8;">LangGraph (Stateful AI Workflows)</td>
	</tr>
	<tr style="border-bottom: 1px solid #1e293b;">
	<td style="padding: 8px; font-weight: 600;">LLM Intelligence</td>
	<td style="padding: 8px; color: #94a3b8;">Google Gemini / Ollama (Optional)</td>
	</tr>
	<tr style="border-bottom: 1px solid #1e293b;">
	<td style="padding: 8px; font-weight: 600;">Web UI</td>
	<td style="padding: 8px; color: #94a3b8;">Gradio 6</td>
	</tr>
	<tr>
	<td style="padding: 8px; font-weight: 600;">Document Output</td>
	<td style="padding: 8px; color: #94a3b8;">python-docx (.docx)</td>
	</tr>
	</table>

	<h3 style="color: #818cf8; margin-top: 20px;">Ethical Design</h3>
	<ul style="color: #94a3b8;">
	<li><b>Privacy:</b> All processing is done server-side; no data is shared externally.</li>
	<li><b>Transparency:</b> Full agent decision log visible in real-time.</li>
	<li><b>User Control:</b> Human-in-the-loop editing before final document generation.</li>
	<li><b>Safety:</b> Confidence scoring and quality alerts for low-quality results.</li>
	</ul>
	</div>
	""")

	    return app


	# ─── Entry Point ───────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    app = build_ui()

	    # Indigo/slate variant of the Soft theme; passed to launch() along with
	    # the custom stylesheet.
	    soft_theme = gr.themes.Soft(
	        primary_hue="indigo",
	        secondary_hue="slate",
	        neutral_hue="slate",
	    )
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "show_error": True,
	        "theme": soft_theme,
	        "css": CUSTOM_CSS,
	    }
	    app.launch(**launch_options)