"""Gradio front-end that converts images and documents to Markdown with Docling.

The module builds one shared ``DocumentConverter`` at import time, exposes
``convert_file`` (also published as the ``convert`` API endpoint), and wires a
small two-column UI: upload + status on the left, an editable Markdown pane
with copy/download actions on the right.
"""

import html
import tempfile
import time
from pathlib import Path

import gradio as gr

# ── Docling ────────────────────────────────────────────────────────────────────
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import (
    DocumentConverter,
    ImageFormatOption,
    PdfFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline  # noqa: F401  (kept: import predates this edit)
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

MAX_PDF_PAGES = 40
SUPPORTED_IMAGE = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"]
SUPPORTED_DOCS = [".pdf", ".docx", ".pptx", ".html", ".htm", ".xlsx"]

# Built once so the per-request extension check is a set lookup instead of
# concatenating the two lists on every call.
_SUPPORTED_EXTS = frozenset(SUPPORTED_IMAGE + SUPPORTED_DOCS)


# ── Build converter once at startup ───────────────────────────────────────────
def _build():
    """Create the shared ``DocumentConverter`` with OCR and table extraction on.

    PDFs and images both go through ``StandardPdfPipeline`` with the same
    options (full-page EasyOCR, table structure with cell matching). The other
    allowed formats use Docling's default format options.
    """
    ocr = EasyOcrOptions(force_full_page_ocr=True)

    pdf = PdfPipelineOptions()
    pdf.do_ocr = True
    pdf.ocr_options = ocr
    pdf.do_table_structure = True
    pdf.table_structure_options.do_cell_matching = True

    return DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.PPTX,
            InputFormat.HTML,
            InputFormat.XLSX,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pdf,
                backend=PyPdfiumDocumentBackend,
            ),
            # Images need the page-based pipeline to be OCR'd; SimplePipeline
            # is meant for declarative formats (DOCX, HTML, ...) and would
            # never apply the OCR options configured above.
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pdf,
            ),
        },
    )


CONVERTER = _build()


# ── Helpers ────────────────────────────────────────────────────────────────────
def _pdf_pages(path: str) -> int:
    """Return the page count of the PDF at *path*, or 0 if it cannot be read.

    This is a best-effort pre-check for the page limit: an unreadable PDF
    yields 0 so the converter itself gets to report the real error.
    """
    try:
        import pypdfium2 as pdfium

        doc = pdfium.PdfDocument(path)
    except Exception:
        return 0
    try:
        return len(doc)
    except Exception:
        return 0
    finally:
        # Always release the native handle, even if len() failed.
        doc.close()


def _save_md(text: str) -> str:
    """Write *text* to a new UTF-8 ``.md`` temp file and return its path.

    The file is created with ``delete=False`` so it outlives this call and can
    be served by the download button.
    NOTE(review): nothing in this module removes these files afterwards —
    confirm whether cleanup is handled elsewhere.
    """
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w", encoding="utf-8"
    ) as handle:
        handle.write(text)
        return handle.name


def _status(kind: str, msg: str) -> str:
    """Render a one-line status message as an HTML snippet.

    Args:
        kind: One of ``"ok"``, ``"error"``, ``"warn"`` or ``""``; selects the
            colour and icon. Unknown kinds fall back to grey with no icon.
        msg: Text to show. It is HTML-escaped because it may contain exception
            text or a user-supplied file extension.

    Returns:
        The HTML snippet, or ``""`` when *msg* is empty.
    """
    if not msg:
        return ""
    colors = {"ok": "#22863a", "error": "#cb2431", "warn": "#b08800", "": "#666"}
    icons = {"ok": "✓", "error": "✗", "warn": "⚠", "": ""}
    c = colors.get(kind, "#666")
    i = icons.get(kind, "")
    return f'<div style="color:{c}">{i} {html.escape(msg)}</div>'


# ── Core logic ─────────────────────────────────────────────────────────────────
def convert_file(file_path: str):
    """Convert an uploaded file to Markdown (backs the ``convert`` API endpoint).

    Args:
        file_path: Path of the uploaded file; falsy when nothing was uploaded.

    Returns:
        A ``(markdown, status_html, download_path)`` tuple. On any failure
        (unsupported type, PDF over ``MAX_PDF_PAGES``, conversion error, or no
        text extracted) ``markdown`` is ``""`` and ``download_path`` is
        ``None``; ``status_html`` explains why.
    """
    if not file_path:
        return "", _status("", ""), None

    ext = Path(file_path).suffix.lower()

    if ext not in _SUPPORTED_EXTS:
        msg = f"Unsupported type: {ext or '(no extension)'}"
        return "", _status("error", msg), None

    if ext == ".pdf":
        n = _pdf_pages(file_path)
        if n > MAX_PDF_PAGES:
            msg = f"PDF has {n} pages — limit is {MAX_PDF_PAGES}."
            return "", _status("error", msg), None

    try:
        t0 = time.time()
        result = CONVERTER.convert(file_path)
        elapsed = time.time() - t0
        md = result.document.export_to_markdown()

        if not md.strip():
            return "", _status("warn", "No text extracted."), None

        words = len(md.split())
        msg = f"Done in {elapsed:.1f}s  ·  {words:,} words  ·  {len(md):,} chars"
        return md, _status("ok", msg), _save_md(md)
    except Exception as e:
        # UI boundary: surface the failure in the status area instead of
        # crashing the request. Fall back to the exception class name because
        # _status() renders nothing for an empty message.
        return "", _status("error", str(e) or type(e).__name__), None


def regen_download(md):
    """Return the path of a fresh ``.md`` file holding *md*, or ``None`` if blank.

    Used to keep the download button in sync with edits made in the editor.
    """
    if md and md.strip():
        return _save_md(md)
    return None


# ── CSS ────────────────────────────────────────────────────────────────────────
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&family=JetBrains+Mono:wght@400;500&display=swap');

:root {
  --bg: #fafafa;
  --card: #ffffff;
  --border: #e4e4e7;
  --text: #09090b;
  --muted: #71717a;
  --accent: #18181b;
  --blue: #2563eb;
  --r: 10px;
  --sans: 'Inter', system-ui, sans-serif;
  --mono: 'JetBrains Mono', monospace;
}

/* ── Reset & base ── */
*, *::before, *::after { box-sizing: border-box; }
html, body { margin: 0; padding: 0; }
body, .gradio-container, .svelte-1kyws56 {
  background: var(--bg) !important;
  font-family: var(--sans) !important;
  color: var(--text) !important;
}
.gradio-container {
  max-width: 1200px !important;
  margin: 0 auto !important;
  padding: 0 16px 48px !important;
}
footer, .built-with, #footer { display: none !important; }

/* ── Header ── */
#docling-header {
  padding: 32px 4px 24px;
  border-bottom: 1px solid var(--border);
  margin-bottom: 24px;
}
#docling-header h1 {
  font-size: 1.25rem;
  font-weight: 600;
  color: var(--text);
  margin: 0 0 4px;
  letter-spacing: -0.025em;
  font-family: var(--sans);
}
#docling-header p {
  font-size: 0.85rem;
  color: var(--muted);
  margin: 0;
  font-weight: 400;
  font-family: var(--sans);
}

/* ── Cards ── */
.card {
  background: var(--card);
  border: 1px solid var(--border);
  border-radius: var(--r);
  padding: 20px;
}

/* ── Upload ── */
#upload-box { margin-bottom: 0 !important; }
#upload-box > .wrap {
  border: 2px dashed var(--border) !important;
  border-radius: var(--r) !important;
  background: var(--bg) !important;
  transition: border-color 0.15s ease, background 0.15s ease !important;
  padding: 28px 16px !important;
  cursor: pointer !important;
}
#upload-box > .wrap:hover {
  border-color: var(--blue) !important;
  background: #eff6ff !important;
}
#upload-box .wrap svg { color: var(--muted) !important; }
#upload-box .wrap .text-center span {
  font-family: var(--sans) !important;
  font-size: 0.82rem !important;
  color: var(--muted) !important;
}
#upload-box label.svelte-1b6s6g { display: none !important; }

/* ── Primary button ── */
#btn-convert {
  background: var(--accent) !important;
  color: #fff !important;
  border: none !important;
  border-radius: var(--r) !important;
  font-family: var(--sans) !important;
  font-size: 0.875rem !important;
  font-weight: 500 !important;
  height: 42px !important;
  width: 100% !important;
  cursor: pointer !important;
  transition: background 0.15s ease !important;
  letter-spacing: -0.01em !important;
}
#btn-convert:hover { background: #3f3f46 !important; }
#btn-convert:active { background: #52525b !important; }

/* ── Ghost buttons ── */
#btn-clear, #btn-copy, #btn-dl button, #btn-dl a {
  background: transparent !important;
  color: var(--muted) !important;
  border: 1px solid var(--border) !important;
  border-radius: var(--r) !important;
  font-family: var(--sans) !important;
  font-size: 0.8rem !important;
  font-weight: 400 !important;
  cursor: pointer !important;
  transition: border-color 0.15s, color 0.15s, background 0.15s !important;
  text-decoration: none !important;
  display: inline-flex !important;
  align-items: center !important;
  gap: 5px !important;
  padding: 8px 14px !important;
  height: 36px !important;
  white-space: nowrap !important;
}
#btn-clear:hover { border-color: #a1a1aa !important; color: var(--text) !important; }
#btn-copy:hover { border-color: var(--blue) !important; color: var(--blue) !important; }
#btn-dl button:hover, #btn-dl a:hover {
  border-color: var(--blue) !important;
  color: var(--blue) !important;
  background: #eff6ff !important;
}
#btn-clear { width: 100% !important; }

/* ── Status ── */
#status-box > div { padding: 0 !important; }
#status-box > div > div {
  background: transparent !important;
  border: none !important;
  padding: 0 !important;
}

/* ── Formats chip list ── */
#fmt-list {
  padding: 0 !important;
  background: transparent !important;
  border: none !important;
}
#fmt-list > div {
  background: transparent !important;
  border: none !important;
  padding: 0 !important;
}
#fmt-list p {
  font-family: var(--sans) !important;
  font-size: 0.78rem !important;
  color: var(--muted) !important;
  line-height: 1.9 !important;
  margin: 0 !important;
}
#fmt-list strong { color: var(--text) !important; font-weight: 500 !important; }
#fmt-list hr { border-color: var(--border) !important; margin: 10px 0 !important; }
#fmt-list em { font-style: normal !important; color: #a1a1aa !important; }

/* ── Editor ── */
#md-editor > label {
  font-family: var(--sans) !important;
  font-size: 0.72rem !important;
  font-weight: 600 !important;
  text-transform: uppercase !important;
  letter-spacing: 0.08em !important;
  color: var(--muted) !important;
}
#md-editor textarea {
  font-family: var(--mono) !important;
  font-size: 0.83rem !important;
  line-height: 1.7 !important;
  color: var(--text) !important;
  background: var(--card) !important;
  border: 1px solid var(--border) !important;
  border-radius: var(--r) !important;
  padding: 18px 20px !important;
  resize: vertical !important;
  transition: border-color 0.15s, box-shadow 0.15s !important;
  caret-color: var(--blue) !important;
}
#md-editor textarea:focus {
  border-color: #93c5fd !important;
  outline: none !important;
  box-shadow: 0 0 0 3px rgba(37,99,235,0.08) !important;
}
#md-editor textarea::placeholder {
  color: #d4d4d8 !important;
  font-style: italic !important;
}

/* ── Toolbar row ── */
#toolbar-row {
  display: flex !important;
  align-items: center !important;
  justify-content: space-between !important;
  flex-wrap: wrap !important;
  gap: 8px !important;
  margin-bottom: 8px !important;
}
#toolbar-row > * { flex-shrink: 0 !important; }
#toolbar-label {
  font-family: var(--sans) !important;
  font-size: 0.72rem !important;
  font-weight: 600 !important;
  text-transform: uppercase !important;
  letter-spacing: 0.08em !important;
  color: var(--muted) !important;
}
#toolbar-actions {
  display: flex !important;
  gap: 8px !important;
  align-items: center !important;
}

/* ── Remove extra Gradio chrome ── */
.gap-4 { gap: 1rem !important; }
.gr-form { box-shadow: none !important; }
div.svelte-vt7nkj { gap: 0 !important; }

/* ── Responsive ── */
@media (max-width: 768px) {
  .gradio-container { padding: 0 12px 32px !important; }
  #docling-header { padding: 20px 4px 16px; margin-bottom: 16px; }
  #docling-header h1 { font-size: 1.05rem; }
}
"""

# ── JS ─────────────────────────────────────────────────────────────────────────
# Copies the editor contents to the clipboard and flashes a confirmation on the
# copy button for two seconds. Does nothing when the editor is empty.
COPY_JS = """
() => {
    const ta = document.querySelector('#md-editor textarea');
    if (!ta || !ta.value.trim()) return;
    navigator.clipboard.writeText(ta.value).then(() => {
        const btn = document.querySelector('#btn-copy');
        if (!btn) return;
        const orig = btn.innerHTML;
        btn.innerHTML = '✓ Copied!';
        btn.style.borderColor = '#16a34a';
        btn.style.color = '#16a34a';
        setTimeout(() => {
            btn.innerHTML = orig;
            btn.style.borderColor = '';
            btn.style.color = '';
        }, 2000);
    });
}
"""

# ── UI ─────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Docling · OCR → Markdown") as demo:

    # The id matches the "#docling-header" rules in CSS.
    gr.HTML("""
<div id="docling-header">
  <h1>📄 Docling OCR → Markdown</h1>
  <p>Convert images &amp; documents to clean, structured Markdown using OCR. Edit and download the result.</p>
</div>
""")

    with gr.Row(equal_height=False):

        # ── Left column ───────────────────────────────────────────────────────
        with gr.Column(scale=1, min_width=260):
            file_input = gr.File(
                label="",
                elem_id="upload-box",
                # Derived from the constants so the picker and the server-side
                # check in convert_file() cannot drift apart.
                file_types=SUPPORTED_DOCS + SUPPORTED_IMAGE,
                type="filepath",
                height=160,
            )
            btn_convert = gr.Button(
                "⚡ Convert to Markdown",
                variant="primary",
                elem_id="btn-convert",
            )
            status_out = gr.HTML(value="", elem_id="status-box")
            btn_clear = gr.Button("✕ Clear", elem_id="btn-clear", size="sm")
            gr.Markdown(
                "**Supported formats**\n\n"
                "🖼  JPG · PNG · TIFF · WebP · BMP\n\n"
                # Interpolated so the label always states the enforced limit.
                f"📄  PDF *(max {MAX_PDF_PAGES} pages)*\n\n"
                "📝  DOCX · PPTX · XLSX\n\n"
                "🌐  HTML\n\n"
                "---\n"
                "*OCR applied automatically*",
                elem_id="fmt-list",
            )

        # ── Right column ──────────────────────────────────────────────────────
        with gr.Column(scale=3, min_width=400):

            # Toolbar — element ids match the "#toolbar-*" rules in CSS.
            with gr.Row(elem_id="toolbar-row"):
                gr.HTML('<span id="toolbar-label">Markdown Editor</span>')
                with gr.Row(elem_id="toolbar-actions"):
                    btn_copy = gr.Button("⎘ Copy", elem_id="btn-copy", size="sm")
                    btn_dl = gr.DownloadButton(
                        label="↓ Download .md",
                        value=None,
                        size="sm",
                        elem_id="btn-dl",
                    )

            md_editor = gr.Textbox(
                value="",
                label="",
                placeholder="Your converted Markdown will appear here.\nYou can edit it freely before copying or downloading.",
                lines=28,
                max_lines=80,
                interactive=True,
                elem_id="md-editor",
                show_label=False,
            )

    # ── Wiring ────────────────────────────────────────────────────────────────
    def run(file):
        """Click handler for the convert button; delegates to ``convert_file``."""
        md, status, dl = convert_file(file)
        return md, status, dl

    btn_convert.click(
        fn=run,
        inputs=[file_input],
        outputs=[md_editor, status_out, btn_dl],
        api_name="convert",
    )

    # Rebuild the downloadable file whenever the editor text changes so the
    # download reflects manual edits.
    md_editor.change(
        fn=regen_download,
        inputs=[md_editor],
        outputs=[btn_dl],
        api_name=False,
    )

    btn_copy.click(fn=None, js=COPY_JS)

    def do_clear():
        """Reset the file input, editor, status area and download button."""
        return None, "", "", None

    btn_clear.click(
        fn=do_clear,
        inputs=[],
        outputs=[file_input, md_editor, status_out, btn_dl],
    )


if __name__ == "__main__":
    # NOTE(review): passing ``css`` to launch() depends on the installed Gradio
    # version (older releases take it on gr.Blocks instead) — confirm against
    # the pinned version.
    demo.launch(css=CSS)