""" Agentic Image-to-Word Converter — Gradio 6 Web Application A modern agentic OCR system that converts images to formatted Word documents using docTR, LangGraph, and optional LLM intelligence. """ import os import sys import logging import gradio as gr from datetime import datetime # Ensure project root is in path sys.path.insert(0, os.path.dirname(__file__)) from agents.graph import run_agent from memory.store import MemoryStore logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) memory = MemoryStore() # ─── Custom CSS ──────────────────────────────────────────────────────────────── CUSTOM_CSS = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap'); * { font-family: 'Inter', sans-serif !important; } .gradio-container { max-width: 1200px !important; margin: auto !important; } /* Header banner */ .header-banner { background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #0f172a 100%); border: 1px solid rgba(99, 102, 241, 0.3); border-radius: 16px; padding: 24px 32px; margin-bottom: 16px; position: relative; overflow: hidden; } .header-banner::before { content: ''; position: absolute; top: -50%; left: -50%; width: 200%; height: 200%; background: radial-gradient(circle at 30% 50%, rgba(99, 102, 241, 0.08) 0%, transparent 50%); animation: pulse-glow 4s ease-in-out infinite; } @keyframes pulse-glow { 0%, 100% { opacity: 0.5; } 50% { opacity: 1; } } .header-banner h1 { color: #e2e8f0 !important; font-size: 1.8em !important; font-weight: 700 !important; margin: 0 !important; position: relative; z-index: 1; } .header-banner p { color: #94a3b8 !important; font-size: 0.95em !important; margin: 4px 0 0 !important; position: relative; z-index: 1; } /* Agent log styling */ .agent-log { background: #0f172a; border: 1px solid #1e293b; border-radius: 12px; padding: 16px; font-family: 'JetBrains Mono', 'Fira Code', monospace !important; font-size: 0.85em; line-height: 1.6; max-height: 400px; overflow-y: auto; } /* Status badges */ .status-excellent { color: #4ade80; font-weight: 600; } .status-good { color: #60a5fa; font-weight: 600; } .status-fair { color: #fbbf24; font-weight: 600; } .status-poor { color: #f87171; font-weight: 600; } /* Confidence bar */ .confidence-bar { height: 8px; border-radius: 4px; background: #1e293b; overflow: hidden; margin: 8px 0; } .confidence-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; } /* Card styling */ .settings-card { background: linear-gradient(135deg, #1e293b, #0f172a); border: 1px solid rgba(99, 102, 241, 0.2); border-radius: 12px; padding: 16px; } /* Preview area */ .preview-area { background: #ffffff; color: #1e293b; border-radius: 12px; padding: 20px; min-height: 200px; border: 1px solid #e2e8f0; } /* Tab styling override */ .tab-nav button { font-weight: 500 !important; font-size: 0.9em !important; } """ # ─── Processing Function ────────────────────────────────────────────────────── def process_image(image_path, api_key, llm_provider, progress=gr.Progress()): """Main processing function invoked by the UI.""" if image_path is None: return ( None, '

Please upload an image first.

', "", "", "" ) progress(0.1, desc="Agent: Perceiving image...") # Determine provider provider = "none" key = api_key.strip() if api_key else "" if llm_provider == "Google Gemini" and key: provider = "gemini" elif llm_provider == "Ollama (Local)" and key: provider = "ollama" elif llm_provider == "Ollama (Local)" and not key: provider = "ollama" key = "http://localhost:11434" try: progress(0.2, desc="Agent: Analyzing image properties...") # Run the full agentic workflow result = run_agent(image_path, api_key=key, llm_provider=provider) progress(0.9, desc="Agent: Finalizing...") # Extract results error = result.get("error") if error: return ( None, f'

{error}

', "", _format_agent_log(result.get("processing_log", [])), "" ) docx_path = result.get("docx_path", "") preview_html = result.get("preview_html", "") raw_text = result.get("raw_text", "") confidence = result.get("confidence_score", 0) quality = result.get("quality_assessment", "unknown") agent_log = _format_agent_log(result.get("processing_log", [])) # Build status HTML status_html = _build_status_html(confidence, quality, result) progress(1.0, desc="✅ Complete!") return ( docx_path, preview_html, raw_text, agent_log, status_html ) except Exception as e: logger.error(f"Processing error: {e}", exc_info=True) return ( None, f'

❌ Error: {str(e)}

', "", f"Error: {str(e)}", "" ) def _build_status_html(confidence, quality, result): """Build status display HTML.""" conf_pct = int(confidence * 100) color = ( "#4ade80" if confidence >= 0.8 else "#60a5fa" if confidence >= 0.7 else "#fbbf24" if confidence >= 0.5 else "#f87171" ) engine = result.get("ocr_result", {}).get("engine_used", "Unknown") num_paras = len(result.get("layout_analysis", {}).get("paragraphs", [])) props = result.get("image_properties", {}) html = f"""

{'Good' if confidence >= 0.7 else 'Bad' if confidence >= 0.5 else 'Failed'}

{quality.upper()} Quality — {conf_pct}% Confidence

OCR Engine: {engine} | Paragraphs: {num_paras}

Resolution: {props.get('width', '?')}×{props.get('height', '?')}

DPI: {props.get('resolution_dpi', '?')}

Contrast: {props.get('contrast_score', 0):.0%}

Noise: {props.get('noise_level', 0):.0%}

⚠️ Low confidence — consider reviewing the extracted text before downloading.

' if result.get("needs_human_review") else ''}

""" return html def _format_agent_log(log): """Format agent processing log as readable text.""" if not log: return "No processing log available." lines = [] step_icons = { "perceive": "", "analyze": "", "decide": "", "act": "", "learn": "", } for entry in log: step = entry.get("step", "") icon = step_icons.get(step, "▸") action = entry.get("action", "") detail = entry.get("detail", "") ts = entry.get("timestamp", "") time_str = "" if ts: try: dt = datetime.fromisoformat(ts) time_str = f"[{dt.strftime('%H:%M:%S')}] " except Exception: pass line = f"{icon} {time_str}{action}" if detail: line += f"\n └─ {detail}" lines.append(line) return "\n".join(lines) def get_history_html(): """Get processing history as formatted HTML.""" stats = memory.get_stats() history = memory.get_history(10) if stats["total_processed"] == 0: return '

No processing history yet. Convert an image to start building memory.

' html = f"""

{stats['total_processed']}

Documents Processed

{stats['avg_confidence']:.0%}

Avg Confidence

{len(stats.get('quality_distribution', {}))}

Quality Levels

""" if history: html += '

Recent Activity

' for record in reversed(history[-5:]): ts = record.get("timestamp", record.get("saved_at", "")) conf = record.get("confidence_score", 0) quality = record.get("quality_assessment", "?") engine = record.get("ocr_engine", "?") n_paras = record.get("num_paragraphs", 0) html += f"""

{engine} — {n_paras} paragraphs {conf:.0%} confidence

{ts}

""" html += '

' return html # ─── Build Gradio UI ────────────────────────────────────────────────────────── def build_ui(): """Build the Gradio 6 application.""" with gr.Blocks( title="Agentic Image2Word — AI-Powered OCR Converter", ) as app: # Header gr.HTML(""" """) with gr.Tabs() as tabs: # ── Tab 1: Converter ────────────────────────────────────────── with gr.TabItem("Convert", id="convert-tab"): with gr.Row(equal_height=False): # Left Column — Input & Settings with gr.Column(scale=1): img_input = gr.Image( type="filepath", label="Upload Image", height=300, sources=["upload", "clipboard"], ) with gr.Accordion("⚙️ AI Settings", open=False): llm_provider = gr.Dropdown( choices=["None (Rule-Based)", "Google Gemini", "Ollama (Local)"], value="None (Rule-Based)", label="Intelligence Provider", info="Select LLM for enhanced formatting", ) api_key = gr.Textbox( label="API Key / Ollama URL", placeholder="Enter Gemini API key or Ollama URL...", type="password", info="Gemini: paste API key | Ollama: http://localhost:11434", ) btn_convert = gr.Button( "Convert with Agent", variant="primary", size="lg", ) # Status display status_html = gr.HTML( value='

Upload an image and click Convert to start.

', label="Agent Status", ) # Right Column — Output with gr.Column(scale=1): with gr.Tabs(): with gr.TabItem("Preview"): preview_output = gr.HTML( value='

Document preview will appear here...

', label="Document Preview", ) with gr.TabItem("Raw Text"): text_output = gr.Textbox( label="Extracted Text (Editable)", lines=12, interactive=True, placeholder="Extracted text will appear here. You can edit before downloading...", info="Edit text here before generating the final document", ) with gr.TabItem("Agent Log"): agent_log = gr.Textbox( label="Agent Decision Log", lines=12, interactive=False, placeholder="Agent processing steps will appear here...", ) file_output = gr.File( label="📥 Download Word Document", interactive=False, ) # Wire up conversion btn_convert.click( fn=process_image, inputs=[img_input, api_key, llm_provider], outputs=[file_output, preview_output, text_output, agent_log, status_html], ) # ── Tab 2: Memory & History ─────────────────────────────────── with gr.TabItem("History & Memory", id="history-tab"): gr.HTML("""

Agent Memory

The agent learns from each document it processes, adapting its preprocessing and formatting decisions based on past results.

""") history_display = gr.HTML(value=get_history_html()) btn_refresh = gr.Button("Refresh History", size="sm") btn_refresh.click(fn=get_history_html, outputs=[history_display]) # ── Tab 3: About ────────────────────────────────────────────── with gr.TabItem("About", id="about-tab"): gr.HTML("""

Agentic Image2Word Converter

This application transforms scanned documents and images into formatted Word documents using an agentic AI architecture.

Architecture

Perceive

Analyze image

Analyze

Run OCR

Decide

Format text

Act

Generate DOCX

Learn

Save memory

Technology Stack

OCR Engine	docTR (Deep Learning) + Tesseract fallback
Agentic Framework	LangGraph (Stateful AI Workflows)
LLM Intelligence	Google Gemini / Ollama (Optional)
Web UI	Gradio 6
Document Output	python-docx (.docx)

Ethical Design

Privacy: All processing is done server-side; no data is shared externally.
Transparency: Full agent decision log visible in real-time.
User Control: Human-in-the-loop editing before final document generation.
Safety: Confidence scoring and quality alerts for low-quality results.

""") return app # ─── Entry Point ─────────────────────────────────────────────────────────────── if __name__ == "__main__": app = build_ui() app.launch( server_name="0.0.0.0", server_port=7860, share=False, show_error=True, theme=gr.themes.Soft( primary_hue="indigo", secondary_hue="slate", neutral_hue="slate", ), css=CUSTOM_CSS, )

RayXar - Image2Word Converter

Agent Memory

Agentic Image2Word Converter

Architecture

Technology Stack

Ethical Design