Spaces:
Sleeping
Sleeping
| """ | |
Agentic Image-to-Word Converter — Gradio 6 Web Application
| A modern agentic OCR system that converts images to formatted Word documents | |
| using docTR, LangGraph, and optional LLM intelligence. | |
| """ | |
| import os | |
| import sys | |
| import logging | |
| import gradio as gr | |
| from datetime import datetime | |
| # Ensure project root is in path | |
| sys.path.insert(0, os.path.dirname(__file__)) | |
| from agents.graph import run_agent | |
| from memory.store import MemoryStore | |
# Set the root logger to INFO at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Single module-level memory store; get_history_html() reads stats and history from it.
memory = MemoryStore()
# ─── Custom CSS ───────────────────────────────────────────────────────────────
# Stylesheet handed to app.launch(css=...) in the entry point.
# `.header-banner` is used by the header markup in build_ui(), and
# `.confidence-bar` / `.confidence-fill` by _build_status_html().
# NOTE(review): the remaining classes (.agent-log, .status-*, .settings-card,
# .preview-area) are not referenced by any markup visible in this file —
# confirm whether they are still needed.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
* { font-family: 'Inter', sans-serif !important; }
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
/* Header banner */
.header-banner {
background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #0f172a 100%);
border: 1px solid rgba(99, 102, 241, 0.3);
border-radius: 16px;
padding: 24px 32px;
margin-bottom: 16px;
position: relative;
overflow: hidden;
}
.header-banner::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle at 30% 50%, rgba(99, 102, 241, 0.08) 0%, transparent 50%);
animation: pulse-glow 4s ease-in-out infinite;
}
@keyframes pulse-glow {
0%, 100% { opacity: 0.5; }
50% { opacity: 1; }
}
.header-banner h1 {
color: #e2e8f0 !important;
font-size: 1.8em !important;
font-weight: 700 !important;
margin: 0 !important;
position: relative;
z-index: 1;
}
.header-banner p {
color: #94a3b8 !important;
font-size: 0.95em !important;
margin: 4px 0 0 !important;
position: relative;
z-index: 1;
}
/* Agent log styling */
.agent-log {
background: #0f172a;
border: 1px solid #1e293b;
border-radius: 12px;
padding: 16px;
font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
font-size: 0.85em;
line-height: 1.6;
max-height: 400px;
overflow-y: auto;
}
/* Status badges */
.status-excellent { color: #4ade80; font-weight: 600; }
.status-good { color: #60a5fa; font-weight: 600; }
.status-fair { color: #fbbf24; font-weight: 600; }
.status-poor { color: #f87171; font-weight: 600; }
/* Confidence bar */
.confidence-bar {
height: 8px;
border-radius: 4px;
background: #1e293b;
overflow: hidden;
margin: 8px 0;
}
.confidence-fill {
height: 100%;
border-radius: 4px;
transition: width 0.5s ease;
}
/* Card styling */
.settings-card {
background: linear-gradient(135deg, #1e293b, #0f172a);
border: 1px solid rgba(99, 102, 241, 0.2);
border-radius: 12px;
padding: 16px;
}
/* Preview area */
.preview-area {
background: #ffffff;
color: #1e293b;
border-radius: 12px;
padding: 20px;
min-height: 200px;
border: 1px solid #e2e8f0;
}
/* Tab styling override */
.tab-nav button {
font-weight: 500 !important;
font-size: 0.9em !important;
}
"""
# ─── Processing Function ──────────────────────────────────────────────────────
def process_image(image_path, api_key, llm_provider, progress=gr.Progress()):
    """Run the agent workflow on one uploaded image and shape the result for the UI.

    Args:
        image_path: Path of the uploaded image, or ``None`` if nothing was uploaded.
        api_key: Raw text from the "API Key / Ollama URL" box; may be empty or ``None``.
        llm_provider: Label selected in the provider dropdown.
        progress: Gradio progress tracker used to report coarse progress.

    Returns:
        A 5-tuple ``(docx_path, preview_html, raw_text, agent_log, status_html)``
        in the order the convert button's ``outputs`` list expects. On failure
        the document slot is ``None`` and the preview slot carries an error panel.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    panel_open = '<div style="color: #f87171; padding: 20px;">'

    if image_path is None:
        return (
            None,
            f"{panel_open} Please upload an image first.</div>",
            "",
            "",
            "",
        )

    progress(0.1, desc="Agent: Perceiving image...")

    # Map the dropdown label to the provider id passed to run_agent.
    key = api_key.strip() if api_key else ""
    if llm_provider == "Google Gemini" and key:
        provider = "gemini"
    elif llm_provider == "Ollama (Local)":
        provider = "ollama"
        # For Ollama the text box holds the server URL; default to the local endpoint.
        key = key or "http://localhost:11434"
    else:
        # Rule-based mode, or Gemini selected without a key.
        provider = "none"

    # NOTE(review): the leading "β" glyph in the "Complete!" and "Error:"
    # messages below looks mis-encoded — confirm the intended symbol.
    try:
        progress(0.2, desc="Agent: Analyzing image properties...")
        # Run the full agentic workflow.
        result = run_agent(image_path, api_key=key, llm_provider=provider)
        progress(0.9, desc="Agent: Finalizing...")

        error = result.get("error")
        if error:
            # The message is rendered as HTML, so escape it before interpolation.
            return (
                None,
                f"{panel_open} {escape(str(error))}</div>",
                "",
                _format_agent_log(result.get("processing_log", [])),
                "",
            )

        agent_log = _format_agent_log(result.get("processing_log", []))
        status_html = _build_status_html(
            result.get("confidence_score", 0),
            result.get("quality_assessment", "unknown"),
            result,
        )
        progress(1.0, desc="β Complete!")
        return (
            # Hand the File output None rather than an empty string when no
            # document path was produced.
            result.get("docx_path") or None,
            result.get("preview_html", ""),
            result.get("raw_text", ""),
            agent_log,
            status_html,
        )
    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing the handler.
        logger.exception("Processing error: %s", e)
        return (
            None,
            f"{panel_open}β Error: {escape(str(e))}</div>",
            "",
            f"Error: {e}",
            "",
        )
def _build_status_html(confidence, quality, result):
    """Build the HTML status panel shown after a conversion.

    Args:
        confidence: Confidence score compared against 0.8 / 0.7 / 0.5 thresholds.
            ``None`` or a non-numeric value is treated as 0.
        quality: Quality label; upper-cased for display. ``None`` or an empty
            value is shown as "UNKNOWN".
        result: Agent result mapping. Reads ``ocr_result.engine_used``,
            ``layout_analysis.paragraphs``, ``image_properties`` and
            ``needs_human_review``; missing or ``None`` entries fall back to
            placeholders.

    Returns:
        An HTML string with every interpolated value escaped.
    """
    # Function-scope imports keep this block self-contained.
    from html import escape
    from math import isfinite

    def as_fraction(value):
        """Coerce *value* to a finite float, falling back to 0.0."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        return number if isfinite(number) else 0.0

    result = result or {}
    confidence = as_fraction(confidence)
    # round() rather than int(): 0.29 * 100 is 28.999999999999996 and would show as 28%.
    conf_pct = round(confidence * 100)
    # The bar is a CSS width, so keep it inside 0–100 even for out-of-range scores.
    bar_pct = min(100, max(0, conf_pct))

    if confidence >= 0.8:
        color = "#4ade80"
    elif confidence >= 0.7:
        color = "#60a5fa"
    elif confidence >= 0.5:
        color = "#fbbf24"
    else:
        color = "#f87171"
    verdict = "Good" if confidence >= 0.7 else "Bad" if confidence >= 0.5 else "Failed"

    # `or {}` / `or []` also covers keys that are present but set to None.
    ocr_result = result.get("ocr_result") or {}
    layout = result.get("layout_analysis") or {}
    props = result.get("image_properties") or {}

    engine = escape(str(ocr_result.get("engine_used", "Unknown")))
    num_paras = len(layout.get("paragraphs") or [])
    quality_label = escape(str(quality or "unknown").upper())
    width = escape(str(props.get("width", "?")))
    height = escape(str(props.get("height", "?")))
    dpi = escape(str(props.get("resolution_dpi", "?")))
    contrast = as_fraction(props.get("contrast_score", 0))
    noise = as_fraction(props.get("noise_level", 0))

    review_note = ""
    if result.get("needs_human_review"):
        review_note = (
            '<div style="margin-top: 12px; padding: 8px; '
            "background: rgba(251, 191, 36, 0.1); border-radius: 8px; "
            'color: #fbbf24; font-size: 0.85em;">'
            "⚠️ Low confidence — consider reviewing the extracted text before downloading."
            "</div>"
        )

    return f"""
<div style="padding: 16px; font-family: Inter, sans-serif;">
  <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 12px;">
    <span style="font-size: 1.5em;">{verdict}</span>
    <div>
      <div style="font-weight: 600; font-size: 1.1em; color: {color};">
        {quality_label} Quality — {conf_pct}% Confidence
      </div>
      <div style="color: #94a3b8; font-size: 0.85em;">
        OCR Engine: {engine} | Paragraphs: {num_paras}
      </div>
    </div>
  </div>
  <div class="confidence-bar">
    <div class="confidence-fill" style="width: {bar_pct}%; background: {color};"></div>
  </div>
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 8px; margin-top: 12px; font-size: 0.85em; color: #94a3b8;">
    <div>Resolution: {width}×{height}</div>
    <div>DPI: {dpi}</div>
    <div>Contrast: {contrast:.0%}</div>
    <div>Noise: {noise:.0%}</div>
  </div>
  {review_note}
</div>
"""
def _format_agent_log(log):
    """Format the agent processing log as readable plain text.

    Args:
        log: Iterable of log entries. Dict entries are read through the keys
            ``step``, ``action``, ``detail`` and ``timestamp`` (all optional);
            any other entry is rendered with ``str()``.

    Returns:
        One line per entry, joined with newlines, with the detail (if any) on an
        indented follow-up line. Returns a placeholder sentence when *log* is
        empty or ``None``.
    """
    if not log:
        return "No processing log available."

    # The known workflow steps carry no icon; any other step gets the generic marker.
    step_icons = {
        "perceive": "",
        "analyze": "",
        "decide": "",
        "act": "",
        "learn": "",
    }

    def clock_prefix(ts):
        """Return ``"[HH:MM:SS] "`` for a datetime or ISO string, else ``""``."""
        if not ts:
            return ""
        try:
            moment = ts if isinstance(ts, datetime) else datetime.fromisoformat(ts)
        except (TypeError, ValueError):
            # Best effort: an unparseable timestamp is simply omitted.
            return ""
        return f"[{moment.strftime('%H:%M:%S')}] "

    lines = []
    for entry in log:
        if not isinstance(entry, dict):
            # Keep free-form entries instead of failing the whole log.
            lines.append(str(entry))
            continue

        icon = step_icons.get(entry.get("step", ""), "▸")
        head = f"{clock_prefix(entry.get('timestamp', ''))}{entry.get('action', '')}"
        # Skip the separator when there is no icon so the line has no leading blank.
        line = f"{icon} {head}" if icon else head

        detail = entry.get("detail", "")
        if detail:
            line += f"\n └─ {detail}"
        lines.append(line)

    return "\n".join(lines)
def get_history_html():
    """Render memory statistics and recent processing records as HTML.

    Reads ``memory.get_stats()`` and ``memory.get_history(10)``.

    Returns:
        A placeholder panel when nothing has been processed yet; otherwise three
        summary cards followed by up to five records from the end of the
        returned history, last record first. Record fields are escaped before
        being placed in the markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    stats = memory.get_stats()
    history = memory.get_history(10)

    # .get() also covers a stats mapping that lacks the counter entirely.
    if not stats.get("total_processed"):
        return '<div style="padding: 20px; color: #94a3b8; text-align: center;">No processing history yet. Convert an image to start building memory.</div>'

    avg_confidence = float(stats.get("avg_confidence") or 0)
    quality_levels = len(stats.get("quality_distribution") or {})

    # (background rgb, accent colour, value, caption) for each summary card.
    cards = [
        ("99, 102, 241", "#818cf8", escape(str(stats["total_processed"])), "Documents Processed"),
        ("74, 222, 128", "#4ade80", f"{avg_confidence:.0%}", "Avg Confidence"),
        ("96, 165, 250", "#60a5fa", str(quality_levels), "Quality Levels"),
    ]

    parts = [
        '<div style="padding: 16px;">',
        '<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; margin-bottom: 16px;">',
    ]
    for rgb, accent, value, caption in cards:
        parts.append(
            f'<div style="background: rgba({rgb}, 0.1); padding: 12px; border-radius: 8px; text-align: center;">'
            f'<div style="font-size: 1.5em; font-weight: 700; color: {accent};">{value}</div>'
            f'<div style="color: #94a3b8; font-size: 0.85em;">{caption}</div>'
            "</div>"
        )
    parts.append("</div>")

    if history:
        parts.append('<div style="font-weight: 600; margin-bottom: 8px; color: #e2e8f0;">Recent Activity</div>')
        # Last five returned records, emitted in reverse order.
        for record in reversed(history[-5:]):
            ts = escape(str(record.get("timestamp", record.get("saved_at", ""))))
            conf = float(record.get("confidence_score") or 0)
            engine = escape(str(record.get("ocr_engine", "?")))
            n_paras = escape(str(record.get("num_paragraphs", 0)))
            parts.append(
                '<div style="border: 1px solid #1e293b; border-radius: 8px; padding: 10px; margin-bottom: 6px; font-size: 0.85em;">'
                '<div style="display: flex; justify-content: space-between;">'
                f'<span style="color: #e2e8f0;">{engine} — {n_paras} paragraphs</span>'
                f'<span style="color: #94a3b8;">{conf:.0%} confidence</span>'
                "</div>"
                f'<div style="color: #64748b; font-size: 0.8em;">{ts}</div>'
                "</div>"
            )

    parts.append("</div>")
    return "\n".join(parts)
# ─── Build Gradio UI ──────────────────────────────────────────────────────────
def build_ui():
    """Build the Gradio 6 application.

    Lays out a header banner and three tabs (Convert, History & Memory, About),
    wires the convert and refresh buttons to their handlers, and returns the
    ``gr.Blocks`` object without launching it.
    """
    with gr.Blocks(
        title="Agentic Image2Word β AI-Powered OCR Converter",
    ) as app:
        # Header
        gr.HTML("""
<div class="header-banner">
<h1>RayXar - Image2Word Converter</h1>
<p>AI-powered document conversion with adaptive OCR, intelligent formatting, and learning memory</p>
</div>
""")
        # `tabs` is not referenced again in this function.
        with gr.Tabs() as tabs:
            # ── Tab 1: Converter ──────────────────────────────────────────
            with gr.TabItem("Convert", id="convert-tab"):
                with gr.Row(equal_height=False):
                    # Left Column — Input & Settings
                    with gr.Column(scale=1):
                        # type="filepath": the handler receives a path string.
                        img_input = gr.Image(
                            type="filepath",
                            label="Upload Image",
                            height=300,
                            sources=["upload", "clipboard"],
                        )
                        with gr.Accordion("βοΈ AI Settings", open=False):
                            # These labels are compared verbatim in process_image().
                            llm_provider = gr.Dropdown(
                                choices=["None (Rule-Based)", "Google Gemini", "Ollama (Local)"],
                                value="None (Rule-Based)",
                                label="Intelligence Provider",
                                info="Select LLM for enhanced formatting",
                            )
                            api_key = gr.Textbox(
                                label="API Key / Ollama URL",
                                placeholder="Enter Gemini API key or Ollama URL...",
                                type="password",
                                info="Gemini: paste API key | Ollama: http://localhost:11434",
                            )
                        btn_convert = gr.Button(
                            "Convert with Agent",
                            variant="primary",
                            size="lg",
                        )
                        # Status display
                        status_html = gr.HTML(
                            value='<div style="padding: 16px; color: #94a3b8; text-align: center;">Upload an image and click Convert to start.</div>',
                            label="Agent Status",
                        )
                    # Right Column — Output
                    with gr.Column(scale=1):
                        with gr.Tabs():
                            with gr.TabItem("Preview"):
                                preview_output = gr.HTML(
                                    value='<div style="padding: 40px; color: #94a3b8; text-align: center; font-style: italic;">Document preview will appear here...</div>',
                                    label="Document Preview",
                                )
                            with gr.TabItem("Raw Text"):
                                # NOTE(review): this box is editable, but no handler in
                                # this function reads it back — confirm whether edited
                                # text is meant to feed a regenerated document.
                                text_output = gr.Textbox(
                                    label="Extracted Text (Editable)",
                                    lines=12,
                                    interactive=True,
                                    placeholder="Extracted text will appear here. You can edit before downloading...",
                                    info="Edit text here before generating the final document",
                                )
                            with gr.TabItem("Agent Log"):
                                agent_log = gr.Textbox(
                                    label="Agent Decision Log",
                                    lines=12,
                                    interactive=False,
                                    placeholder="Agent processing steps will appear here...",
                                )
                        file_output = gr.File(
                            label="π₯ Download Word Document",
                            interactive=False,
                        )
                # Wire up conversion
                # The outputs order must match the 5-tuple returned by process_image().
                btn_convert.click(
                    fn=process_image,
                    inputs=[img_input, api_key, llm_provider],
                    outputs=[file_output, preview_output, text_output, agent_log, status_html],
                )
            # ── Tab 2: Memory & History ───────────────────────────────────
            with gr.TabItem("History & Memory", id="history-tab"):
                gr.HTML("""
<div style="padding: 12px 0;">
<h3 style="margin: 0; color: #e2e8f0;">Agent Memory</h3>
<p style="color: #94a3b8; margin: 4px 0 0;">
The agent learns from each document it processes, adapting its preprocessing
and formatting decisions based on past results.
</p>
</div>
""")
                # Rendered once while the UI is built; the button re-renders it on demand.
                history_display = gr.HTML(value=get_history_html())
                btn_refresh = gr.Button("Refresh History", size="sm")
                btn_refresh.click(fn=get_history_html, outputs=[history_display])
            # ── Tab 3: About ──────────────────────────────────────────────
            with gr.TabItem("About", id="about-tab"):
                gr.HTML("""
<div style="padding: 20px; line-height: 1.8;">
<h2 style="color: #e2e8f0;">Agentic Image2Word Converter</h2>
<p style="color: #94a3b8;">
This application transforms scanned documents and images into formatted
Word documents using an <b>agentic AI architecture</b>.
</p>
<h3 style="color: #818cf8; margin-top: 20px;">Architecture</h3>
<div style="display: grid; grid-template-columns: repeat(5, 1fr); gap: 8px; margin: 12px 0;">
<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
<div style="font-size: 1.5em;"></div>
<div style="font-weight: 600; color: #e2e8f0;">Perceive</div>
<div style="font-size: 0.75em; color: #94a3b8;">Analyze image</div>
</div>
<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
<div style="font-size: 1.5em;"></div>
<div style="font-weight: 600; color: #e2e8f0;">Analyze</div>
<div style="font-size: 0.75em; color: #94a3b8;">Run OCR</div>
</div>
<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
<div style="font-size: 1.5em;"></div>
<div style="font-weight: 600; color: #e2e8f0;">Decide</div>
<div style="font-size: 0.75em; color: #94a3b8;">Format text</div>
</div>
<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
<div style="font-size: 1.5em;"></div>
<div style="font-weight: 600; color: #e2e8f0;">Act</div>
<div style="font-size: 0.75em; color: #94a3b8;">Generate DOCX</div>
</div>
<div style="background: rgba(99,102,241,0.1); padding: 12px; border-radius: 8px; text-align: center;">
<div style="font-size: 1.5em;"></div>
<div style="font-weight: 600; color: #e2e8f0;">Learn</div>
<div style="font-size: 0.75em; color: #94a3b8;">Save memory</div>
</div>
</div>
<h3 style="color: #818cf8; margin-top: 20px;">Technology Stack</h3>
<table style="width: 100%; color: #e2e8f0; border-collapse: collapse; margin: 12px 0;">
<tr style="border-bottom: 1px solid #1e293b;">
<td style="padding: 8px; font-weight: 600;">OCR Engine</td>
<td style="padding: 8px; color: #94a3b8;">docTR (Deep Learning) + Tesseract fallback</td>
</tr>
<tr style="border-bottom: 1px solid #1e293b;">
<td style="padding: 8px; font-weight: 600;">Agentic Framework</td>
<td style="padding: 8px; color: #94a3b8;">LangGraph (Stateful AI Workflows)</td>
</tr>
<tr style="border-bottom: 1px solid #1e293b;">
<td style="padding: 8px; font-weight: 600;">LLM Intelligence</td>
<td style="padding: 8px; color: #94a3b8;">Google Gemini / Ollama (Optional)</td>
</tr>
<tr style="border-bottom: 1px solid #1e293b;">
<td style="padding: 8px; font-weight: 600;">Web UI</td>
<td style="padding: 8px; color: #94a3b8;">Gradio 6</td>
</tr>
<tr>
<td style="padding: 8px; font-weight: 600;">Document Output</td>
<td style="padding: 8px; color: #94a3b8;">python-docx (.docx)</td>
</tr>
</table>
<h3 style="color: #818cf8; margin-top: 20px;">Ethical Design</h3>
<ul style="color: #94a3b8;">
<li><b>Privacy:</b> All processing is done server-side; no data is shared externally.</li>
<li><b>Transparency:</b> Full agent decision log visible in real-time.</li>
<li><b>User Control:</b> Human-in-the-loop editing before final document generation.</li>
<li><b>Safety:</b> Confidence scoring and quality alerts for low-quality results.</li>
</ul>
</div>
""")
    return app
# ─── Entry Point ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Build the UI first, then collect the launch options in one place.
    app = build_ui()
    launch_options = {
        # Listen on all interfaces, fixed port, no public share link.
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        # Theme and stylesheet are supplied at launch time.
        "theme": gr.themes.Soft(
            primary_hue="indigo",
            secondary_hue="slate",
            neutral_hue="slate",
        ),
        "css": CUSTOM_CSS,
    }
    app.launch(**launch_options)