Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import tempfile | |
| import time | |
| from pathlib import Path | |
# ── Docling ──────────────────────────────────────────────────────────────────
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.document_converter import ImageFormatOption | |
| from docling.pipeline.simple_pipeline import SimplePipeline | |
| from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline | |
| from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend | |
# Largest page count a PDF may have before convert_file rejects it.
MAX_PDF_PAGES = 40

# Lower-case file suffixes accepted as image input.
SUPPORTED_IMAGE = [
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".tiff",
    ".tif",
    ".webp",
]

# Lower-case file suffixes accepted as document input.
SUPPORTED_DOCS = [
    ".pdf",
    ".docx",
    ".pptx",
    ".html",
    ".htm",
    ".xlsx",
]
# ── Build converter once at startup ──────────────────────────────────────────
def _build():
    """Create the DocumentConverter configured for this app.

    PDFs and images share one set of pipeline options: EasyOCR forced over
    the full page, plus table-structure recovery with cell matching.  The
    other allowed formats (DOCX, PPTX, HTML, XLSX) are given no explicit
    format option here.

    Returns:
        The configured ``DocumentConverter``.
    """
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = True
    pdf_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    pdf_options.do_table_structure = True
    pdf_options.table_structure_options.do_cell_matching = True

    return DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.PPTX,
            InputFormat.HTML,
            InputFormat.XLSX,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pdf_options,
                backend=PyPdfiumDocumentBackend,
            ),
            # Images are page bitmaps and need the OCR-capable PDF pipeline.
            # SimplePipeline is meant for declarative formats (DOCX, HTML, ...)
            # and performs no OCR, so routing images through it cannot
            # produce text.
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pdf_options,
            ),
        },
    )
# Built once when the module is imported; convert_file reuses this instance
# for every request instead of constructing a converter per call.
CONVERTER = _build()
# ── Helpers ──────────────────────────────────────────────────────────────────
| def _pdf_pages(path): | |
| try: | |
| import pypdfium2 as pdfium | |
| d = pdfium.PdfDocument(path) | |
| n = len(d); d.close(); return n | |
| except Exception: | |
| return 0 | |
| def _save_md(text): | |
| t = tempfile.NamedTemporaryFile(delete=False, suffix=".md", | |
| mode="w", encoding="utf-8") | |
| t.write(text); t.close() | |
| return t.name | |
# ── Core logic ───────────────────────────────────────────────────────────────
def convert_file(file_path: str):
    """
    API endpoint: upload a file → get Markdown back.

    Validation happens before conversion: PDFs over ``MAX_PDF_PAGES`` pages
    and files whose suffix is in neither supported list are rejected.  Any
    exception raised during conversion is reported through the status string
    instead of propagating.

    Args:
        file_path: Path of the uploaded file; a falsy value means "no file".

    Returns:
        A ``(markdown, status_html, download_path)`` tuple.  On rejection,
        failure, or empty output, ``markdown`` is ``""`` and
        ``download_path`` is ``None``.
    """
    if not file_path:
        return "", _status("", ""), None

    ext = Path(file_path).suffix.lower()

    if ext == ".pdf":
        n = _pdf_pages(file_path)
        if n > MAX_PDF_PAGES:
            msg = f"PDF has {n} pages — limit is {MAX_PDF_PAGES}."
            return "", _status("error", msg), None

    if ext not in SUPPORTED_IMAGE and ext not in SUPPORTED_DOCS:
        msg = f"Unsupported type: {ext}"
        return "", _status("error", msg), None

    try:
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # or jump if the system clock is adjusted mid-conversion.
        t0 = time.perf_counter()
        result = CONVERTER.convert(file_path)
        elapsed = time.perf_counter() - t0
        md = result.document.export_to_markdown()
        if not md.strip():
            return "", _status("warn", "No text extracted."), None
        words = len(md.split())
        msg = f"Done in {elapsed:.1f}s · {words:,} words · {len(md):,} chars"
        return md, _status("ok", msg), _save_md(md)
    except Exception as e:
        # Some exceptions carry no message; fall back to the class name so
        # the failure is never rendered as an empty status.
        return "", _status("error", str(e) or type(e).__name__), None
| def _status(kind, msg): | |
| colors = {"ok": "#22863a", "error": "#cb2431", "warn": "#b08800", "": "#666"} | |
| c = colors.get(kind, "#666") | |
| icons = {"ok": "β", "error": "β", "warn": "β ", "": ""} | |
| i = icons.get(kind, "") | |
| if not msg: | |
| return "" | |
| return f'<p style="margin:0;font-size:0.8rem;font-family:monospace;color:{c}">{i} {msg}</p>' | |
def regen_download(md):
    """Return a fresh download file for *md*, or None when it has no content.

    Args:
        md: Current editor text; may be ``None`` or empty.

    Returns:
        Path produced by ``_save_md`` when *md* contains non-whitespace
        text, otherwise ``None``.
    """
    has_content = bool(md) and bool(md.strip())
    return _save_md(md) if has_content else None
# ── CSS ──────────────────────────────────────────────────────────────────────
# Stylesheet passed to demo.launch(). Selectors target the elem_id values set
# on the components below plus a few Gradio-generated class names.
# NOTE(review): the `.svelte-*` class names are build-specific hashes and may
# stop matching after a Gradio upgrade - confirm against the pinned version.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&family=JetBrains+Mono:wght@400;500&display=swap');
:root {
    --bg: #fafafa;
    --card: #ffffff;
    --border: #e4e4e7;
    --text: #09090b;
    --muted: #71717a;
    --accent: #18181b;
    --blue: #2563eb;
    --r: 10px;
    --sans: 'Inter', system-ui, sans-serif;
    --mono: 'JetBrains Mono', monospace;
}
/* ── Reset & base ── */
*, *::before, *::after { box-sizing: border-box; }
html, body { margin: 0; padding: 0; }
body, .gradio-container, .svelte-1kyws56 {
    background: var(--bg) !important;
    font-family: var(--sans) !important;
    color: var(--text) !important;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
    padding: 0 16px 48px !important;
}
footer, .built-with, #footer { display: none !important; }
/* ── Header ── */
#docling-header {
    padding: 32px 4px 24px;
    border-bottom: 1px solid var(--border);
    margin-bottom: 24px;
}
#docling-header h1 {
    font-size: 1.25rem;
    font-weight: 600;
    color: var(--text);
    margin: 0 0 4px;
    letter-spacing: -0.025em;
    font-family: var(--sans);
}
#docling-header p {
    font-size: 0.85rem;
    color: var(--muted);
    margin: 0;
    font-weight: 400;
    font-family: var(--sans);
}
/* ── Cards ── */
.card {
    background: var(--card);
    border: 1px solid var(--border);
    border-radius: var(--r);
    padding: 20px;
}
/* ── Upload ── */
#upload-box { margin-bottom: 0 !important; }
#upload-box > .wrap {
    border: 2px dashed var(--border) !important;
    border-radius: var(--r) !important;
    background: var(--bg) !important;
    transition: border-color 0.15s ease, background 0.15s ease !important;
    padding: 28px 16px !important;
    cursor: pointer !important;
}
#upload-box > .wrap:hover {
    border-color: var(--blue) !important;
    background: #eff6ff !important;
}
#upload-box .wrap svg { color: var(--muted) !important; }
#upload-box .wrap .text-center span {
    font-family: var(--sans) !important;
    font-size: 0.82rem !important;
    color: var(--muted) !important;
}
#upload-box label.svelte-1b6s6g { display: none !important; }
/* ── Primary button ── */
#btn-convert {
    background: var(--accent) !important;
    color: #fff !important;
    border: none !important;
    border-radius: var(--r) !important;
    font-family: var(--sans) !important;
    font-size: 0.875rem !important;
    font-weight: 500 !important;
    height: 42px !important;
    width: 100% !important;
    cursor: pointer !important;
    transition: background 0.15s ease !important;
    letter-spacing: -0.01em !important;
}
#btn-convert:hover { background: #3f3f46 !important; }
#btn-convert:active { background: #52525b !important; }
/* ── Ghost buttons ── */
#btn-clear, #btn-copy, #btn-dl button, #btn-dl a {
    background: transparent !important;
    color: var(--muted) !important;
    border: 1px solid var(--border) !important;
    border-radius: var(--r) !important;
    font-family: var(--sans) !important;
    font-size: 0.8rem !important;
    font-weight: 400 !important;
    cursor: pointer !important;
    transition: border-color 0.15s, color 0.15s, background 0.15s !important;
    text-decoration: none !important;
    display: inline-flex !important;
    align-items: center !important;
    gap: 5px !important;
    padding: 8px 14px !important;
    height: 36px !important;
    white-space: nowrap !important;
}
#btn-clear:hover { border-color: #a1a1aa !important; color: var(--text) !important; }
#btn-copy:hover { border-color: var(--blue) !important; color: var(--blue) !important; }
#btn-dl button:hover, #btn-dl a:hover {
    border-color: var(--blue) !important;
    color: var(--blue) !important;
    background: #eff6ff !important;
}
#btn-clear { width: 100% !important; }
/* ── Status ── */
#status-box > div { padding: 0 !important; }
#status-box > div > div { background: transparent !important; border: none !important; padding: 0 !important; }
/* ── Formats chip list ── */
#fmt-list { padding: 0 !important; background: transparent !important; border: none !important; }
#fmt-list > div { background: transparent !important; border: none !important; padding: 0 !important; }
#fmt-list p {
    font-family: var(--sans) !important;
    font-size: 0.78rem !important;
    color: var(--muted) !important;
    line-height: 1.9 !important;
    margin: 0 !important;
}
#fmt-list strong { color: var(--text) !important; font-weight: 500 !important; }
#fmt-list hr { border-color: var(--border) !important; margin: 10px 0 !important; }
#fmt-list em { font-style: normal !important; color: #a1a1aa !important; }
/* ── Editor ── */
#md-editor > label {
    font-family: var(--sans) !important;
    font-size: 0.72rem !important;
    font-weight: 600 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.08em !important;
    color: var(--muted) !important;
}
#md-editor textarea {
    font-family: var(--mono) !important;
    font-size: 0.83rem !important;
    line-height: 1.7 !important;
    color: var(--text) !important;
    background: var(--card) !important;
    border: 1px solid var(--border) !important;
    border-radius: var(--r) !important;
    padding: 18px 20px !important;
    resize: vertical !important;
    transition: border-color 0.15s, box-shadow 0.15s !important;
    caret-color: var(--blue) !important;
}
#md-editor textarea:focus {
    border-color: #93c5fd !important;
    outline: none !important;
    box-shadow: 0 0 0 3px rgba(37,99,235,0.08) !important;
}
#md-editor textarea::placeholder {
    color: #d4d4d8 !important;
    font-style: italic !important;
}
/* ── Toolbar row ── */
#toolbar-row {
    display: flex !important;
    align-items: center !important;
    justify-content: space-between !important;
    flex-wrap: wrap !important;
    gap: 8px !important;
    margin-bottom: 8px !important;
}
#toolbar-row > * { flex-shrink: 0 !important; }
#toolbar-label {
    font-family: var(--sans) !important;
    font-size: 0.72rem !important;
    font-weight: 600 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.08em !important;
    color: var(--muted) !important;
}
#toolbar-actions {
    display: flex !important;
    gap: 8px !important;
    align-items: center !important;
}
/* ── Remove extra Gradio chrome ── */
.gap-4 { gap: 1rem !important; }
.gr-form { box-shadow: none !important; }
div.svelte-vt7nkj { gap: 0 !important; }
/* ── Responsive ── */
@media (max-width: 768px) {
    .gradio-container { padding: 0 12px 32px !important; }
    #docling-header { padding: 20px 4px 16px; margin-bottom: 16px; }
    #docling-header h1 { font-size: 1.05rem; }
}
"""
# ── JS ───────────────────────────────────────────────────────────────────────
# Browser-side handler for the Copy button: copies the editor textarea to the
# clipboard and shows a "Copied!" label on the button for two seconds.
COPY_JS = """
() => {
    const ta = document.querySelector('#md-editor textarea');
    if (!ta || !ta.value.trim()) return;
    // The async clipboard API is missing on insecure (non-HTTPS) origins.
    if (!navigator.clipboard) {
        console.error('Clipboard API is not available in this context.');
        return;
    }
    navigator.clipboard.writeText(ta.value).then(() => {
        const btn = document.querySelector('#btn-copy');
        // Skip while feedback is already showing; otherwise a second click
        // would capture "Copied!" as the label to restore.
        if (!btn || btn.dataset.copied) return;
        const orig = btn.innerHTML;
        btn.dataset.copied = '1';
        btn.innerHTML = '✓ Copied!';
        btn.style.borderColor = '#16a34a';
        btn.style.color = '#16a34a';
        setTimeout(() => {
            btn.innerHTML = orig;
            btn.style.borderColor = '';
            btn.style.color = '';
            delete btn.dataset.copied;
        }, 2000);
    }).catch((err) => {
        console.error('Copy to clipboard failed:', err);
    });
}
"""
# ── UI ───────────────────────────────────────────────────────────────────────
# Two-column layout: upload/controls on the left, editable Markdown on the
# right. Component elem_id values are what the CSS and COPY_JS selectors use.
with gr.Blocks(title="Docling · OCR → Markdown") as demo:
    gr.HTML("""
<div id="docling-header">
<h1>📄 Docling OCR → Markdown</h1>
<p>Convert images & documents to clean, structured Markdown using OCR. Edit and download the result.</p>
</div>
""")
    with gr.Row(equal_height=False):
        # ── Left column ──────────────────────────────────────────────────────
        with gr.Column(scale=1, min_width=260):
            file_input = gr.File(
                label="",
                elem_id="upload-box",
                file_types=[
                    ".pdf", ".docx", ".pptx", ".xlsx", ".html", ".htm",
                    ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp",
                ],
                type="filepath",
                height=160,
            )
            btn_convert = gr.Button(
                "⚡ Convert to Markdown",
                variant="primary",
                elem_id="btn-convert",
            )
            status_out = gr.HTML(value="", elem_id="status-box")
            btn_clear = gr.Button("✕ Clear", elem_id="btn-clear", size="sm")
            gr.Markdown(
                "**Supported formats**\n\n"
                "🖼 JPG · PNG · TIFF · WebP · BMP\n\n"
                # Derived from the constant so the displayed limit cannot
                # drift from the limit convert_file actually enforces.
                f"📄 PDF *(max {MAX_PDF_PAGES} pages)*\n\n"
                "📝 DOCX · PPTX · XLSX\n\n"
                "🌐 HTML\n\n"
                "---\n"
                "*OCR applied automatically*",
                elem_id="fmt-list",
            )
        # ── Right column ─────────────────────────────────────────────────────
        with gr.Column(scale=3, min_width=400):
            # Toolbar
            # NOTE(review): the stylesheet defines #toolbar-row and
            # #toolbar-actions, but neither row sets an elem_id - confirm
            # whether those ids were meant to be applied here.
            with gr.Row():
                gr.HTML('<span id="toolbar-label">Markdown Editor</span>')
                with gr.Row():
                    btn_copy = gr.Button("⎘ Copy", elem_id="btn-copy", size="sm")
                    btn_dl = gr.DownloadButton(
                        label="↓ Download .md",
                        value=None,
                        size="sm",
                        elem_id="btn-dl",
                    )
            md_editor = gr.Textbox(
                value="",
                label="",
                placeholder="Your converted Markdown will appear here.\nYou can edit it freely before copying or downloading.",
                lines=28,
                max_lines=80,
                interactive=True,
                elem_id="md-editor",
                show_label=False,
            )

    # ── Wiring ───────────────────────────────────────────────────────────────
    def run(file):
        """Convert the uploaded file and fan the result out to the outputs."""
        md, status, dl = convert_file(file)
        return md, status, dl

    # Exposed as the "convert" API endpoint.
    btn_convert.click(
        fn=run,
        inputs=[file_input],
        outputs=[md_editor, status_out, btn_dl],
        api_name="convert",
    )
    # Rebuild the download file whenever the editor content changes so the
    # download always matches what is on screen. Kept out of the public API.
    md_editor.change(
        fn=regen_download,
        inputs=[md_editor],
        outputs=[btn_dl],
        api_name=False,
    )
    # Copy runs entirely in the browser; no Python callback is involved.
    btn_copy.click(fn=None, js=COPY_JS)

    def do_clear():
        """Reset the file input, editor, status line and download button."""
        return None, "", "", None

    btn_clear.click(
        fn=do_clear,
        inputs=[],
        outputs=[file_input, md_editor, status_out, btn_dl],
    )
# Start the Gradio server only when this file is executed as a script.
# NOTE(review): passing `css` to launch() assumes a Gradio release whose
# launch() accepts it - confirm against the pinned Gradio version.
if __name__ == "__main__":
    demo.launch(css=CSS)