# docling / app.py
# Hosting-page header captured with the file: sonuprasad23 · commit "Update app.py" (83b6b2c, verified)
import html
import tempfile
import time
from pathlib import Path

import gradio as gr

# ── Docling ────────────────────────────────────────────────────────────────────
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import ImageFormatOption
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
# Largest page count convert_file() accepts for a PDF; longer files are rejected
# with an error status before any conversion is attempted.
MAX_PDF_PAGES = 40
# Lower-case file suffixes accepted as image input.
SUPPORTED_IMAGE = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"]
# Lower-case file suffixes accepted as document input.
SUPPORTED_DOCS = [".pdf", ".docx", ".pptx", ".html", ".htm", ".xlsx"]
# ── Build converter once at startup ───────────────────────────────────────────
def _build():
    """Create the Docling converter used for every request.

    PDFs and images both go through the standard PDF pipeline with EasyOCR
    forced onto the full page and table-structure recovery (with cell
    matching) enabled. DOCX, PPTX, HTML and XLSX are allowed and fall back to
    Docling's default format options.

    Returns:
        A configured ``DocumentConverter``.
    """
    ocr_options = EasyOcrOptions(force_full_page_ocr=True)

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    return DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.PPTX,
            InputFormat.HTML,
            InputFormat.XLSX,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend,
            ),
            # Images need the OCR-capable page pipeline. SimplePipeline is
            # meant for declarative formats (DOCX, HTML, ...) and does not run
            # OCR, so image uploads could never produce text through it.
            # NOTE(review): confirm against the installed Docling version.
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pipeline_options,
            ),
        },
    )


# Built once at import time so model loading is not repeated per request.
CONVERTER = _build()
# ── Helpers ────────────────────────────────────────────────────────────────────
def _pdf_pages(path):
try:
import pypdfium2 as pdfium
d = pdfium.PdfDocument(path)
n = len(d); d.close(); return n
except Exception:
return 0
def _save_md(text):
t = tempfile.NamedTemporaryFile(delete=False, suffix=".md",
mode="w", encoding="utf-8")
t.write(text); t.close()
return t.name
# ── Core logic ─────────────────────────────────────────────────────────────────
def convert_file(file_path: str):
    """
    API endpoint: upload a file -> get Markdown back.

    Args:
        file_path: Path of the uploaded file; falsy when nothing was uploaded.

    Returns:
        A ``(markdown, status_html, download_path)`` tuple. On any failure
        ``markdown`` is ``""`` and ``download_path`` is ``None``; ``status_html``
        then carries the reason (or is empty when no file was given).
    """
    if not file_path:
        return "", _status("", ""), None

    ext = Path(file_path).suffix.lower()
    if ext not in SUPPORTED_IMAGE + SUPPORTED_DOCS:
        # A file without any suffix would otherwise yield a message ending in
        # a bare colon.
        msg = f"Unsupported type: {ext or '(none)'}"
        return "", _status("error", msg), None

    if ext == ".pdf":
        n = _pdf_pages(file_path)
        if n > MAX_PDF_PAGES:
            msg = f"PDF has {n} pages — limit is {MAX_PDF_PAGES}."
            return "", _status("error", msg), None

    try:
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-conversion.
        t0 = time.perf_counter()
        result = CONVERTER.convert(file_path)
        elapsed = time.perf_counter() - t0
        md = result.document.export_to_markdown()
    except Exception as e:
        # UI boundary: report the failure instead of crashing the request.
        # Some exceptions carry no message; fall back to the class name so the
        # status line is never blank on error.
        return "", _status("error", str(e) or type(e).__name__), None

    if not md.strip():
        return "", _status("warn", "No text extracted."), None

    words = len(md.split())
    msg = f"Done in {elapsed:.1f}s  ·  {words:,} words  ·  {len(md):,} chars"
    return md, _status("ok", msg), _save_md(md)
def _status(kind, msg):
colors = {"ok": "#22863a", "error": "#cb2431", "warn": "#b08800", "": "#666"}
c = colors.get(kind, "#666")
icons = {"ok": "βœ“", "error": "βœ—", "warn": "⚠", "": ""}
i = icons.get(kind, "")
if not msg:
return ""
return f'<p style="margin:0;font-size:0.8rem;font-family:monospace;color:{c}">{i} {msg}</p>'
def regen_download(md):
    """Return a fresh ``.md`` file path for the editor content, or ``None``.

    Wired to the editor's change event so the download button serves the
    edited text. Empty, whitespace-only or missing content yields ``None``.
    """
    has_content = bool(md) and bool(md.strip())
    return _save_md(md) if has_content else None
# ── CSS ────────────────────────────────────────────────────────────────────────
# Custom stylesheet handed to demo.launch(css=CSS). The ID selectors match the
# elem_id values set on components in the UI section (upload-box, btn-convert,
# btn-clear, btn-copy, btn-dl, status-box, fmt-list, md-editor) and the ids in
# the inline HTML (docling-header, toolbar-label).
# NOTE(review): no component in the visible UI code sets elem_id
# "toolbar-row" or "toolbar-actions", so those rules currently match nothing.
# NOTE(review): the .svelte-* selectors look like generated class names and
# may stop matching when Gradio is upgraded — confirm.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&family=JetBrains+Mono:wght@400;500&display=swap');
:root {
--bg: #fafafa;
--card: #ffffff;
--border: #e4e4e7;
--text: #09090b;
--muted: #71717a;
--accent: #18181b;
--blue: #2563eb;
--r: 10px;
--sans: 'Inter', system-ui, sans-serif;
--mono: 'JetBrains Mono', monospace;
}
/* ── Reset & base ── */
*, *::before, *::after { box-sizing: border-box; }
html, body { margin: 0; padding: 0; }
body, .gradio-container, .svelte-1kyws56 {
background: var(--bg) !important;
font-family: var(--sans) !important;
color: var(--text) !important;
}
.gradio-container {
max-width: 1200px !important;
margin: 0 auto !important;
padding: 0 16px 48px !important;
}
footer, .built-with, #footer { display: none !important; }
/* ── Header ── */
#docling-header {
padding: 32px 4px 24px;
border-bottom: 1px solid var(--border);
margin-bottom: 24px;
}
#docling-header h1 {
font-size: 1.25rem;
font-weight: 600;
color: var(--text);
margin: 0 0 4px;
letter-spacing: -0.025em;
font-family: var(--sans);
}
#docling-header p {
font-size: 0.85rem;
color: var(--muted);
margin: 0;
font-weight: 400;
font-family: var(--sans);
}
/* ── Cards ── */
.card {
background: var(--card);
border: 1px solid var(--border);
border-radius: var(--r);
padding: 20px;
}
/* ── Upload ── */
#upload-box { margin-bottom: 0 !important; }
#upload-box > .wrap {
border: 2px dashed var(--border) !important;
border-radius: var(--r) !important;
background: var(--bg) !important;
transition: border-color 0.15s ease, background 0.15s ease !important;
padding: 28px 16px !important;
cursor: pointer !important;
}
#upload-box > .wrap:hover {
border-color: var(--blue) !important;
background: #eff6ff !important;
}
#upload-box .wrap svg { color: var(--muted) !important; }
#upload-box .wrap .text-center span {
font-family: var(--sans) !important;
font-size: 0.82rem !important;
color: var(--muted) !important;
}
#upload-box label.svelte-1b6s6g { display: none !important; }
/* ── Primary button ── */
#btn-convert {
background: var(--accent) !important;
color: #fff !important;
border: none !important;
border-radius: var(--r) !important;
font-family: var(--sans) !important;
font-size: 0.875rem !important;
font-weight: 500 !important;
height: 42px !important;
width: 100% !important;
cursor: pointer !important;
transition: background 0.15s ease !important;
letter-spacing: -0.01em !important;
}
#btn-convert:hover { background: #3f3f46 !important; }
#btn-convert:active { background: #52525b !important; }
/* ── Ghost buttons ── */
#btn-clear, #btn-copy, #btn-dl button, #btn-dl a {
background: transparent !important;
color: var(--muted) !important;
border: 1px solid var(--border) !important;
border-radius: var(--r) !important;
font-family: var(--sans) !important;
font-size: 0.8rem !important;
font-weight: 400 !important;
cursor: pointer !important;
transition: border-color 0.15s, color 0.15s, background 0.15s !important;
text-decoration: none !important;
display: inline-flex !important;
align-items: center !important;
gap: 5px !important;
padding: 8px 14px !important;
height: 36px !important;
white-space: nowrap !important;
}
#btn-clear:hover { border-color: #a1a1aa !important; color: var(--text) !important; }
#btn-copy:hover { border-color: var(--blue) !important; color: var(--blue) !important; }
#btn-dl button:hover, #btn-dl a:hover {
border-color: var(--blue) !important;
color: var(--blue) !important;
background: #eff6ff !important;
}
#btn-clear { width: 100% !important; }
/* ── Status ── */
#status-box > div { padding: 0 !important; }
#status-box > div > div { background: transparent !important; border: none !important; padding: 0 !important; }
/* ── Formats chip list ── */
#fmt-list { padding: 0 !important; background: transparent !important; border: none !important; }
#fmt-list > div { background: transparent !important; border: none !important; padding: 0 !important; }
#fmt-list p {
font-family: var(--sans) !important;
font-size: 0.78rem !important;
color: var(--muted) !important;
line-height: 1.9 !important;
margin: 0 !important;
}
#fmt-list strong { color: var(--text) !important; font-weight: 500 !important; }
#fmt-list hr { border-color: var(--border) !important; margin: 10px 0 !important; }
#fmt-list em { font-style: normal !important; color: #a1a1aa !important; }
/* ── Editor ── */
#md-editor > label {
font-family: var(--sans) !important;
font-size: 0.72rem !important;
font-weight: 600 !important;
text-transform: uppercase !important;
letter-spacing: 0.08em !important;
color: var(--muted) !important;
}
#md-editor textarea {
font-family: var(--mono) !important;
font-size: 0.83rem !important;
line-height: 1.7 !important;
color: var(--text) !important;
background: var(--card) !important;
border: 1px solid var(--border) !important;
border-radius: var(--r) !important;
padding: 18px 20px !important;
resize: vertical !important;
transition: border-color 0.15s, box-shadow 0.15s !important;
caret-color: var(--blue) !important;
}
#md-editor textarea:focus {
border-color: #93c5fd !important;
outline: none !important;
box-shadow: 0 0 0 3px rgba(37,99,235,0.08) !important;
}
#md-editor textarea::placeholder {
color: #d4d4d8 !important;
font-style: italic !important;
}
/* ── Toolbar row ── */
#toolbar-row {
display: flex !important;
align-items: center !important;
justify-content: space-between !important;
flex-wrap: wrap !important;
gap: 8px !important;
margin-bottom: 8px !important;
}
#toolbar-row > * { flex-shrink: 0 !important; }
#toolbar-label {
font-family: var(--sans) !important;
font-size: 0.72rem !important;
font-weight: 600 !important;
text-transform: uppercase !important;
letter-spacing: 0.08em !important;
color: var(--muted) !important;
}
#toolbar-actions {
display: flex !important;
gap: 8px !important;
align-items: center !important;
}
/* ── Remove extra Gradio chrome ── */
.gap-4 { gap: 1rem !important; }
.gr-form { box-shadow: none !important; }
div.svelte-vt7nkj { gap: 0 !important; }
/* ── Responsive ── */
@media (max-width: 768px) {
.gradio-container { padding: 0 12px 32px !important; }
#docling-header { padding: 20px 4px 16px; margin-bottom: 16px; }
#docling-header h1 { font-size: 1.05rem; }
}
"""
# ── JS ─────────────────────────────────────────────────────────────────────────
# Client-side handler for the Copy button: copies the editor text to the
# clipboard and briefly swaps the button label for a confirmation.
# The `dataset.copied` flag guards against a second click inside the 2 s
# window; without it the confirmation text would be captured as the
# "original" label and the button would stay stuck on "Copied!".
COPY_JS = """
() => {
    const ta = document.querySelector('#md-editor textarea');
    if (!ta || !ta.value.trim()) return;
    navigator.clipboard.writeText(ta.value).then(() => {
        const btn = document.querySelector('#btn-copy');
        if (!btn || btn.dataset.copied) return;
        const orig = btn.innerHTML;
        btn.dataset.copied = '1';
        btn.innerHTML = '✓&nbsp;Copied!';
        btn.style.borderColor = '#16a34a';
        btn.style.color = '#16a34a';
        setTimeout(() => {
            btn.innerHTML = orig;
            btn.style.borderColor = '';
            btn.style.color = '';
            delete btn.dataset.copied;
        }, 2000);
    });
}
"""
# ── UI ─────────────────────────────────────────────────────────────────────────
# Two-column layout: upload, convert and status controls on the left, an
# editable Markdown pane with copy/download actions on the right.
with gr.Blocks(title="Docling · OCR → Markdown") as demo:
    gr.HTML("""
    <div id="docling-header">
        <h1>📄 Docling OCR → Markdown</h1>
        <p>Convert images &amp; documents to clean, structured Markdown using OCR. Edit and download the result.</p>
    </div>
    """)
    with gr.Row(equal_height=False):
        # ── Left column ───────────────────────────────────────────────────────
        with gr.Column(scale=1, min_width=260):
            file_input = gr.File(
                label="",
                elem_id="upload-box",
                file_types=[
                    ".pdf", ".docx", ".pptx", ".xlsx", ".html", ".htm",
                    ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp",
                ],
                type="filepath",
                height=160,
            )
            btn_convert = gr.Button(
                "⚑ Convert to Markdown",
                variant="primary",
                elem_id="btn-convert",
            )
            status_out = gr.HTML(value="", elem_id="status-box")
            btn_clear = gr.Button("✕ Clear", elem_id="btn-clear", size="sm")
            # The page limit is taken from MAX_PDF_PAGES so this text cannot
            # drift from the limit convert_file() actually enforces.
            # NOTE(review): the icon on the DOCX line was unrecoverably garbled
            # in the source; the memo emoji is a best guess — confirm.
            gr.Markdown(
                "**Supported formats**\n\n"
                "🖼&nbsp; JPG · PNG · TIFF · WebP · BMP\n\n"
                f"📄&nbsp; PDF *(max {MAX_PDF_PAGES} pages)*\n\n"
                "📝&nbsp; DOCX · PPTX · XLSX\n\n"
                "🌐&nbsp; HTML\n\n"
                "---\n"
                "*OCR applied automatically*",
                elem_id="fmt-list",
            )
        # ── Right column ──────────────────────────────────────────────────────
        with gr.Column(scale=3, min_width=400):
            # Toolbar
            with gr.Row():
                gr.HTML('<span id="toolbar-label">Markdown Editor</span>')
                with gr.Row():
                    btn_copy = gr.Button("⎘ Copy", elem_id="btn-copy", size="sm")
                    btn_dl = gr.DownloadButton(
                        label="↓ Download .md",
                        value=None,
                        size="sm",
                        elem_id="btn-dl",
                    )
            md_editor = gr.Textbox(
                value="",
                label="",
                placeholder="Your converted Markdown will appear here.\nYou can edit it freely before copying or downloading.",
                lines=28,
                max_lines=80,
                interactive=True,
                elem_id="md-editor",
                show_label=False,
            )

    # ── Wiring ────────────────────────────────────────────────────────────────
    def run(file):
        """Convert the uploaded file; returns (markdown, status_html, download_path)."""
        md, status, dl = convert_file(file)
        return md, status, dl

    btn_convert.click(
        fn=run,
        inputs=[file_input],
        outputs=[md_editor, status_out, btn_dl],
        api_name="convert",
    )
    # Refresh the downloadable file whenever the editor content changes so the
    # download always reflects manual edits.
    md_editor.change(
        fn=regen_download,
        inputs=[md_editor],
        outputs=[btn_dl],
        api_name=False,
    )
    # Copying happens entirely in the browser; no Python callback is involved.
    btn_copy.click(fn=None, js=COPY_JS)

    def do_clear():
        """Reset the file input, editor, status line and download button."""
        return None, "", "", None

    btn_clear.click(
        fn=do_clear,
        inputs=[],
        outputs=[file_input, md_editor, status_out, btn_dl],
    )
# Script entry point: start the Gradio server with the custom stylesheet.
if __name__ == "__main__":
    demo.launch(css=CSS)