"""
Agentic Image-to-Word Converter — Gradio 6 Web Application
A modern agentic OCR system that converts images to formatted Word documents
using docTR, LangGraph, and optional LLM intelligence.
"""
import html
import logging
import os
import sys
from datetime import datetime

import gradio as gr

# Ensure project root is in path
sys.path.insert(0, os.path.dirname(__file__))

from agents.graph import run_agent
from memory.store import MemoryStore
# Configure the root logger at INFO level for the whole process.
logging.basicConfig(level=logging.INFO)
# Module-level logger; used by process_image() to report failures.
logger = logging.getLogger(__name__)
# Single shared MemoryStore instance, read by get_history_html().
# NOTE(review): constructed at import time; presumably loads persisted
# history from disk — confirm against memory.store.
memory = MemoryStore()
# ─── Custom CSS ────────────────────────────────────────────────────────────────
# Stylesheet for the whole app; handed to Gradio through
# `app.launch(css=CUSTOM_CSS)` in the entry point at the bottom of this file.
# It imports the Inter font from Google Fonts and defines the classes for the
# header banner, agent log, status badges, confidence bar, settings card,
# preview area and tab buttons.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
* { font-family: 'Inter', sans-serif !important; }
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
/* Header banner */
.header-banner {
background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #0f172a 100%);
border: 1px solid rgba(99, 102, 241, 0.3);
border-radius: 16px;
padding: 24px 32px;
margin-bottom: 16px;
position: relative;
overflow: hidden;
}
.header-banner::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle at 30% 50%, rgba(99, 102, 241, 0.08) 0%, transparent 50%);
animation: pulse-glow 4s ease-in-out infinite;
}
@keyframes pulse-glow {
0%, 100% { opacity: 0.5; }
50% { opacity: 1; }
}
.header-banner h1 {
color: #e2e8f0 !important;
font-size: 1.8em !important;
font-weight: 700 !important;
margin: 0 !important;
position: relative;
z-index: 1;
}
.header-banner p {
color: #94a3b8 !important;
font-size: 0.95em !important;
margin: 4px 0 0 !important;
position: relative;
z-index: 1;
}
/* Agent log styling */
.agent-log {
background: #0f172a;
border: 1px solid #1e293b;
border-radius: 12px;
padding: 16px;
font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
font-size: 0.85em;
line-height: 1.6;
max-height: 400px;
overflow-y: auto;
}
/* Status badges */
.status-excellent { color: #4ade80; font-weight: 600; }
.status-good { color: #60a5fa; font-weight: 600; }
.status-fair { color: #fbbf24; font-weight: 600; }
.status-poor { color: #f87171; font-weight: 600; }
/* Confidence bar */
.confidence-bar {
height: 8px;
border-radius: 4px;
background: #1e293b;
overflow: hidden;
margin: 8px 0;
}
.confidence-fill {
height: 100%;
border-radius: 4px;
transition: width 0.5s ease;
}
/* Card styling */
.settings-card {
background: linear-gradient(135deg, #1e293b, #0f172a);
border: 1px solid rgba(99, 102, 241, 0.2);
border-radius: 12px;
padding: 16px;
}
/* Preview area */
.preview-area {
background: #ffffff;
color: #1e293b;
border-radius: 12px;
padding: 20px;
min-height: 200px;
border: 1px solid #e2e8f0;
}
/* Tab styling override */
.tab-nav button {
font-weight: 500 !important;
font-size: 0.9em !important;
}
"""
# ─── Processing Function ──────────────────────────────────────────────────────
def process_image(image_path, api_key, llm_provider, progress=gr.Progress()):
    """Run the agent workflow on an uploaded image and shape the result for the UI.

    Args:
        image_path: Filesystem path of the uploaded image, or ``None`` when
            nothing has been uploaded yet.
        api_key: Gemini API key or Ollama base URL typed by the user; may be
            empty or ``None``.
        llm_provider: Label selected in the provider dropdown
            (``"Google Gemini"``, ``"Ollama (Local)"`` or anything else for
            rule-based processing).
        progress: Gradio progress tracker, injected by Gradio at call time.

    Returns:
        A 5-tuple ``(docx_path, preview_html, raw_text, agent_log,
        status_html)`` in the order of the outputs wired to the Convert
        button. On failure ``docx_path`` is ``None`` and the preview slot
        carries the (HTML-escaped) error message.
    """
    if image_path is None:
        return (None, "<p>Please upload an image first.</p>", "", "", "")

    progress(0.1, desc="Agent: Perceiving image...")

    # Map the dropdown label to the provider id expected by run_agent.
    # Gemini needs a key; Ollama falls back to the default local endpoint.
    key = api_key.strip() if api_key else ""
    provider = "none"
    if llm_provider == "Google Gemini" and key:
        provider = "gemini"
    elif llm_provider == "Ollama (Local)":
        provider = "ollama"
        key = key or "http://localhost:11434"

    try:
        progress(0.2, desc="Agent: Analyzing image properties...")
        # Run the full agentic workflow
        result = run_agent(image_path, api_key=key, llm_provider=provider)
        progress(0.9, desc="Agent: Finalizing...")

        error = result.get("error")
        if error:
            # The message is rendered by an HTML component, so escape it:
            # text such as "<class 'ValueError'>" would otherwise be
            # swallowed (or interpreted) as markup.
            return (
                None,
                f"<p>{html.escape(str(error))}</p>",
                "",
                _format_agent_log(result.get("processing_log", [])),
                "",
            )

        # An empty path is not a valid value for the File output; use None.
        docx_path = result.get("docx_path") or None
        preview_html = result.get("preview_html", "")
        raw_text = result.get("raw_text", "")
        confidence = result.get("confidence_score", 0)
        quality = result.get("quality_assessment", "unknown")
        agent_log = _format_agent_log(result.get("processing_log", []))
        status_html = _build_status_html(confidence, quality, result)

        progress(1.0, desc="✅ Complete!")
        return (docx_path, preview_html, raw_text, agent_log, status_html)
    except Exception as e:
        # Top-level UI boundary: log the traceback and show the failure to
        # the user instead of letting the request crash.
        logger.exception("Processing error: %s", e)
        return (
            None,
            f"<p>❌ Error: {html.escape(str(e))}</p>",
            "",
            f"Error: {e}",
            "",
        )
def _build_status_html(confidence, quality, result):
"""Build status display HTML."""
conf_pct = int(confidence * 100)
color = (
"#4ade80" if confidence >= 0.8
else "#60a5fa" if confidence >= 0.7
else "#fbbf24" if confidence >= 0.5
else "#f87171"
)
engine = result.get("ocr_result", {}).get("engine_used", "Unknown")
num_paras = len(result.get("layout_analysis", {}).get("paragraphs", []))
props = result.get("image_properties", {})
html = f"""
{'Good' if confidence >= 0.7 else 'Bad' if confidence >= 0.5 else 'Failed'}
{quality.upper()} Quality — {conf_pct}% Confidence
OCR Engine: {engine} | Paragraphs: {num_paras}
Resolution: {props.get('width', '?')}×{props.get('height', '?')}
DPI: {props.get('resolution_dpi', '?')}
Contrast: {props.get('contrast_score', 0):.0%}
Noise: {props.get('noise_level', 0):.0%}
{'
⚠️ Low confidence — consider reviewing the extracted text before downloading.
' if result.get("needs_human_review") else ''}
"""
return html
def _format_agent_log(log):
"""Format agent processing log as readable text."""
if not log:
return "No processing log available."
lines = []
step_icons = {
"perceive": "",
"analyze": "",
"decide": "",
"act": "",
"learn": "",
}
for entry in log:
step = entry.get("step", "")
icon = step_icons.get(step, "▸")
action = entry.get("action", "")
detail = entry.get("detail", "")
ts = entry.get("timestamp", "")
time_str = ""
if ts:
try:
dt = datetime.fromisoformat(ts)
time_str = f"[{dt.strftime('%H:%M:%S')}] "
except Exception:
pass
line = f"{icon} {time_str}{action}"
if detail:
line += f"\n └─ {detail}"
lines.append(line)
return "\n".join(lines)
def get_history_html():
    """Render processing statistics and recent activity as an HTML fragment.

    Reads aggregate stats and up to 10 history records from the module-level
    ``memory`` store and lists the five most recent records, newest first.

    Returns:
        An HTML string: a placeholder paragraph when nothing has been
        processed yet, otherwise the summary counters followed by the
        recent-activity entries.
    """
    stats = memory.get_stats()
    history = memory.get_history(10)
    if stats["total_processed"] == 0:
        return (
            "<p>No processing history yet. "
            "Convert an image to start building memory.</p>"
        )

    quality_levels = len(stats.get("quality_distribution") or {})
    parts = [
        "<div>",
        f"<div><strong>{stats['total_processed']}</strong> Documents Processed</div>",
        f"<div><strong>{stats['avg_confidence']:.0%}</strong> Avg Confidence</div>",
        f"<div><strong>{quality_levels}</strong> Quality Levels</div>",
        "</div>",
    ]

    if history:
        parts.append("<h4>Recent Activity</h4>")
        parts.append("<div>")
        for record in reversed(history[-5:]):
            # Stored values are interpolated into HTML, so escape them.
            ts = html.escape(
                str(record.get("timestamp", record.get("saved_at", "")))
            )
            engine = html.escape(str(record.get("ocr_engine", "?")))
            n_paras = html.escape(str(record.get("num_paragraphs", 0)))
            conf = record.get("confidence_score") or 0
            parts.append(
                "<div>"
                f"<div>{engine} — {n_paras} paragraphs</div>"
                f"<div>{conf:.0%} confidence</div>"
                f"<div>{ts}</div>"
                "</div>"
            )
        parts.append("</div>")
    return "\n".join(parts)
# ─── Build Gradio UI ──────────────────────────────────────────────────────────
def build_ui():
    """Build the Gradio 6 application.

    Lays out three tabs — the converter (image upload, AI settings, outputs),
    the history/memory view, and an About page — and wires the Convert and
    Refresh buttons to ``process_image`` and ``get_history_html``.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks(
        title="Agentic Image2Word — AI-Powered OCR Converter",
    ) as app:
        # Header
        gr.HTML("""
        <div class="header-banner">
            <h1>Agentic Image2Word</h1>
            <p>AI-Powered OCR Converter</p>
        </div>
        """)
        with gr.Tabs():
            # ── Tab 1: Converter ──────────────────────────────────────────
            with gr.TabItem("Convert", id="convert-tab"):
                with gr.Row(equal_height=False):
                    # Left Column — Input & Settings
                    with gr.Column(scale=1):
                        img_input = gr.Image(
                            type="filepath",
                            label="Upload Image",
                            height=300,
                            sources=["upload", "clipboard"],
                        )
                        with gr.Accordion("⚙️ AI Settings", open=False):
                            llm_provider = gr.Dropdown(
                                choices=["None (Rule-Based)", "Google Gemini", "Ollama (Local)"],
                                value="None (Rule-Based)",
                                label="Intelligence Provider",
                                info="Select LLM for enhanced formatting",
                            )
                            api_key = gr.Textbox(
                                label="API Key / Ollama URL",
                                placeholder="Enter Gemini API key or Ollama URL...",
                                type="password",
                                info="Gemini: paste API key | Ollama: http://localhost:11434",
                            )
                        btn_convert = gr.Button(
                            "Convert with Agent",
                            variant="primary",
                            size="lg",
                        )
                        # Status display
                        status_html = gr.HTML(
                            value="<p>Upload an image and click Convert to start.</p>",
                            label="Agent Status",
                        )
                    # Right Column — Output
                    with gr.Column(scale=1):
                        with gr.Tabs():
                            with gr.TabItem("Preview"):
                                preview_output = gr.HTML(
                                    value='<div class="preview-area">Document preview will appear here...</div>',
                                    label="Document Preview",
                                )
                            with gr.TabItem("Raw Text"):
                                text_output = gr.Textbox(
                                    label="Extracted Text (Editable)",
                                    lines=12,
                                    interactive=True,
                                    placeholder="Extracted text will appear here. You can edit before downloading...",
                                    info="Edit text here before generating the final document",
                                )
                            with gr.TabItem("Agent Log"):
                                agent_log = gr.Textbox(
                                    label="Agent Decision Log",
                                    lines=12,
                                    interactive=False,
                                    placeholder="Agent processing steps will appear here...",
                                )
                        file_output = gr.File(
                            label="📥 Download Word Document",
                            interactive=False,
                        )
                # Wire up conversion; the output order must match the tuple
                # returned by process_image.
                btn_convert.click(
                    fn=process_image,
                    inputs=[img_input, api_key, llm_provider],
                    outputs=[file_output, preview_output, text_output, agent_log, status_html],
                )
            # ── Tab 2: Memory & History ───────────────────────────────────
            with gr.TabItem("History & Memory", id="history-tab"):
                gr.HTML("""
                <h3>Agent Memory</h3>
                <p>The agent learns from each document it processes, adapting its preprocessing
                and formatting decisions based on past results.</p>
                """)
                # Rendered once when the UI is built; the button re-renders it.
                history_display = gr.HTML(value=get_history_html())
                btn_refresh = gr.Button("Refresh History", size="sm")
                btn_refresh.click(fn=get_history_html, outputs=[history_display])
            # ── Tab 3: About ──────────────────────────────────────────────
            with gr.TabItem("About", id="about-tab"):
                gr.HTML("""
                <h2>Agentic Image2Word Converter</h2>
                <p>This application transforms scanned documents and images into formatted
                Word documents using an agentic AI architecture.</p>
                <h3>Architecture</h3>
                <p>Agent loop: Perceive → Analyze → Decide → Act → Learn</p>
                <h3>Technology Stack</h3>
                <table>
                    <tr><td>OCR Engine</td><td>docTR (Deep Learning) + Tesseract fallback</td></tr>
                    <tr><td>Agentic Framework</td><td>LangGraph (Stateful AI Workflows)</td></tr>
                    <tr><td>LLM Intelligence</td><td>Google Gemini / Ollama (Optional)</td></tr>
                    <tr><td>Web UI</td><td>Gradio 6</td></tr>
                    <tr><td>Document Output</td><td>python-docx (.docx)</td></tr>
                </table>
                <h3>Ethical Design</h3>
                <ul>
                    <li>Privacy: All processing is done server-side; no data is shared externally.</li>
                    <li>Transparency: Full agent decision log visible in real-time.</li>
                    <li>User Control: Human-in-the-loop editing before final document generation.</li>
                    <li>Safety: Confidence scoring and quality alerts for low-quality results.</li>
                </ul>
                """)
    return app
# ─── Entry Point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Build the interface first, then collect the launch options in one place.
    demo = build_ui()
    launch_options = {
        # Listen on every network interface, on a fixed port.
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        # Theme and stylesheet are supplied at launch time.
        "theme": gr.themes.Soft(
            primary_hue="indigo",
            secondary_hue="slate",
            neutral_hue="slate",
        ),
        "css": CUSTOM_CSS,
    }
    demo.launch(**launch_options)