Spaces:

rbughao
/

MarkdownMaker

Sleeping

File size: 3,486 Bytes

7f7bfa6
f13a7e1
d009d38
f13a7e1
d009d38
 
1a0b2db
d009d38
 
f13a7e1
d009d38
 
f13a7e1
d009d38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0b2db
d009d38
 
 
 
 
 
7f7bfa6
d009d38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f13a7e1
d009d38
7f7bfa6
f13a7e1
 
7f7bfa6
d009d38
 
 
 
f13a7e1
d009d38
7f7bfa6
d009d38
 
f13a7e1
fe22f31
d009d38
 
1a0b2db
d009d38
f13a7e1
d009d38
 
 
 
 
 
 
 
 
 
 
 
 
1a0b2db
d009d38
1a0b2db
d009d38
 
 
1a0b2db
7f7bfa6
d009d38
1a0b2db
d009d38
7f7bfa6
 
f13a7e1

import gradio as gr
import fitz  # PyMuPDF
from docx import Document
import tempfile
import os


# ---------- PDF (best-effort) ----------
def pdf_to_markdown(file_path):
    """Extract the text of a PDF as Markdown paragraphs (best-effort).

    Each non-blank line of text on each page becomes its own paragraph;
    no headings, lists or tables are reconstructed.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The stripped, non-empty text lines joined by blank lines, or an
        empty string when no text was extracted.
    """
    lines = []

    # The context manager closes the document even if extraction raises,
    # so the underlying file handle is never leaked.
    with fitz.open(file_path) as doc:
        for page in doc:
            stripped = (raw.strip() for raw in page.get_text("text").split("\n"))
            lines.extend(line for line in stripped if line)

    return "\n\n".join(lines)


# ---------- DOCX (structure-aware) ----------
def _format_run(run):
    """Return a run's text wrapped in Markdown emphasis for bold/italic."""
    text = run.text
    if run.bold and run.italic:
        marker = "***"
    elif run.bold:
        marker = "**"
    elif run.italic:
        marker = "*"
    else:
        return text

    core = text.strip()
    if not core:
        # Emphasis around pure whitespace would render as literal asterisks.
        return text

    # Markdown emphasis markers must touch the text they wrap, so keep any
    # surrounding whitespace outside of the markers.
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    return f"{leading}{marker}{core}{marker}{trailing}"


def _md_table_row(cells):
    """Render a list of cell strings as one Markdown table row."""
    # A literal pipe or a line break inside a cell would split the row.
    cleaned = [
        cell.replace("|", "\\|").replace("\r", " ").replace("\n", " ")
        for cell in cells
    ]
    return "| " + " | ".join(cleaned) + " |"


def docx_to_markdown(file_path):
    """Convert a DOCX file to Markdown.

    Paragraphs are emitted in document order: "Heading N" styles become
    ATX headings, "List Bullet" / "List Number" styles become list items,
    and other paragraphs keep bold/italic run formatting. All tables are
    appended after the paragraphs, using each table's first row as the
    header row.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The generated Markdown, one output line per list entry, joined
        with newlines.
    """
    doc = Document(file_path)
    md = []

    for para in doc.paragraphs:
        text = para.text.strip()

        if not text:
            md.append("")
            continue

        style_name = getattr(para.style, "name", None) or ""

        # Headings
        if style_name.startswith("Heading"):
            try:
                level = int(style_name.replace("Heading", ""))
            except ValueError:
                # Style name carries no usable number (e.g. plain "Heading").
                level = 1
            # Markdown only defines heading levels 1 through 6.
            level = min(max(level, 1), 6)
            md.append("#" * level + " " + text)
            continue

        # Bullet lists
        if "List Bullet" in style_name:
            md.append(f"- {text}")
            continue

        # Numbered lists
        if "List Number" in style_name:
            md.append(f"1. {text}")
            continue

        # Inline formatting
        formatted = "".join(_format_run(run) for run in para.runs if run.text)
        # Fall back to the plain paragraph text if the runs yielded nothing,
        # so visible text is never silently dropped.
        md.append(formatted or text)

    # Tables
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            # Nothing to render, and there is no first row to use as header.
            continue

        md.append("")
        headers = [cell.text.strip() for cell in rows[0].cells]
        md.append(_md_table_row(headers))
        md.append("| " + " | ".join(["---"] * len(headers)) + " |")

        for row in rows[1:]:
            md.append(_md_table_row([cell.text.strip() for cell in row.cells]))

    return "\n".join(md)


# ---------- Main handler ----------
def convert_file(uploaded_file):
    """Convert an uploaded PDF or DOCX file to Markdown.

    Args:
        uploaded_file: The upload to convert. Either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            its ``name`` attribute; ``None`` when nothing was uploaded.

    Returns:
        A ``(markdown_text, md_path)`` tuple. ``md_path`` is the path of a
        temporary ``.md`` file holding the same text. It is ``None`` when
        there is no upload (text is ``""``) or when the extension is not
        ``.pdf`` / ``.docx`` (text is an error message).
    """
    if uploaded_file is None:
        return "", None

    # Accept a bare path as well as a file-like wrapper with a `.name`.
    if isinstance(uploaded_file, (str, os.PathLike)):
        file_path = os.fspath(uploaded_file)
    else:
        file_path = uploaded_file.name
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # Write through the temp file's own handle and close it on exit.
    # delete=False keeps the file on disk so it can be offered for download.
    with tempfile.NamedTemporaryFile(
        "w", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(markdown_text)

    return markdown_text, tmp.name


# ---------- UI ----------
# Build the interface: one upload widget, a convert button, and a row that
# shows the rendered Markdown next to the downloadable .md file.
with gr.Blocks() as demo:
    gr.Markdown("# 📄➡️📝 Document → Markdown Converter")
    gr.Markdown(
        """
**DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)  
**PDF:** Best-effort text conversion (PDFs don’t store structure)
"""
    )

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"]
        )

    convert_btn = gr.Button("Convert")

    with gr.Row():
        md_preview = gr.Markdown(label="Live Markdown Preview")
        md_download = gr.File(label="Download .md file")

    # convert_file returns (markdown_text, md_path), matching the two outputs.
    convert_btn.click(
        fn=convert_file,
        inputs=file_input,
        outputs=[md_preview, md_download]
    )

# Only start the server when run as a script, so importing this module
# (e.g. to reuse the converters) does not launch the app.
if __name__ == "__main__":
    demo.launch()