File size: 3,486 Bytes
7f7bfa6
f13a7e1
d009d38
f13a7e1
d009d38
 
1a0b2db
d009d38
 
f13a7e1
d009d38
 
f13a7e1
d009d38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a0b2db
d009d38
 
 
 
 
 
7f7bfa6
d009d38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f13a7e1
d009d38
7f7bfa6
f13a7e1
 
7f7bfa6
d009d38
 
 
 
f13a7e1
d009d38
7f7bfa6
d009d38
 
f13a7e1
fe22f31
d009d38
 
1a0b2db
d009d38
f13a7e1
d009d38
 
 
 
 
 
 
 
 
 
 
 
 
1a0b2db
d009d38
1a0b2db
d009d38
 
 
1a0b2db
7f7bfa6
d009d38
1a0b2db
d009d38
7f7bfa6
 
f13a7e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
import tempfile
import os


# ---------- PDF (best-effort) ----------
def pdf_to_markdown(file_path):
    """Extract the text of a PDF as Markdown paragraphs (best-effort).

    Only plain text is extracted: every non-blank line of each page becomes
    its own paragraph. Headings, lists and tables are not reconstructed.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The stripped, non-empty text lines of all pages, in page order,
        joined by blank lines. An empty string if no text was found.
    """
    lines = []

    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        for page in doc:
            stripped = (
                line.strip() for line in page.get_text("text").split("\n")
            )
            lines.extend(line for line in stripped if line)

    return "\n\n".join(lines)


# ---------- DOCX (structure-aware) ----------
# Markdown only defines heading levels 1-6; Word styles go up to "Heading 9".
_MAX_MD_HEADING_LEVEL = 6


def _heading_level(style_name):
    """Map a Word "Heading N" style name to a Markdown heading level (1-6)."""
    try:
        level = int(style_name.replace("Heading", ""))
    except ValueError:
        # e.g. a style named just "Heading" or "Heading Title"
        return 1
    return max(1, min(level, _MAX_MD_HEADING_LEVEL))


def _emphasize(text, marker):
    """Wrap the non-whitespace core of *text* in *marker*.

    Leading/trailing whitespace is kept outside the markers: Markdown does
    not treat ``** bold **`` as emphasis, only ``**bold**``.
    """
    core = text.strip()
    if not marker or not core:
        return text
    lead = text[: len(text) - len(text.lstrip())]
    trail = text[len(text.rstrip()):]
    return f"{lead}{marker}{core}{marker}{trail}"


def _runs_to_markdown(runs):
    """Render paragraph runs as text with bold/italic Markdown markers."""
    parts = []
    for run in runs:
        run_text = run.text
        if not run_text:
            continue

        if run.bold and run.italic:
            marker = "***"
        elif run.bold:
            marker = "**"
        elif run.italic:
            marker = "*"
        else:
            marker = ""

        parts.append(_emphasize(run_text, marker))
    return "".join(parts)


def _table_cell(text):
    """Make cell text safe for a single-line Markdown table cell."""
    # A raw "|" would start a new column and a newline would end the row.
    return text.strip().replace("|", "\\|").replace("\n", "<br>")


def _table_to_markdown(table):
    """Render a table as Markdown lines; the first row is the header.

    Returns an empty list for a table that has no rows.
    """
    rows = [[_table_cell(cell.text) for cell in row.cells] for row in table.rows]
    if not rows:
        return []

    header = rows[0]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    lines.extend("| " + " | ".join(cells) + " |" for cells in rows[1:])
    return lines


def docx_to_markdown(file_path):
    """Convert a DOCX file to Markdown using its paragraph styles.

    Paragraphs are emitted in document order, one per output line:

    * blank paragraphs become empty lines;
    * styles whose name starts with "Heading" become ``#`` headings
      (level taken from the style name, clamped to 1-6, default 1);
    * styles containing "List Bullet" / "List Number" become ``- `` /
      ``1. `` items;
    * everything else is rendered run by run with bold/italic markers.

    All tables are appended after the paragraphs (not at their original
    position in the document), each preceded by an empty line, with the
    first row used as the header row.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The Markdown text, lines joined with ``"\\n"``.
    """
    doc = Document(file_path)
    md = []

    for para in doc.paragraphs:
        text = para.text.strip()

        if not text:
            md.append("")
            continue

        # A style's name can be missing; treat that as an unstyled paragraph.
        style_name = para.style.name or ""

        if style_name.startswith("Heading"):
            md.append("#" * _heading_level(style_name) + " " + text)
        elif "List Bullet" in style_name:
            md.append(f"- {text}")
        elif "List Number" in style_name:
            md.append(f"1. {text}")
        else:
            formatted = _runs_to_markdown(para.runs)
            # Fall back to the plain paragraph text if the runs produced
            # nothing although the paragraph itself is not empty.
            md.append(formatted if formatted.strip() else text)

    for table in doc.tables:
        table_lines = _table_to_markdown(table)
        if table_lines:
            md.append("")
            md.extend(table_lines)

    return "\n".join(md)


# ---------- Main handler ----------
def convert_file(uploaded_file):
    """Convert an uploaded PDF or DOCX file to Markdown.

    Args:
        uploaded_file: The upload to convert. Either a filesystem path
            (``str`` or ``os.PathLike``) or an object exposing the path as
            its ``.name`` attribute. ``None`` means nothing was uploaded.

    Returns:
        A ``(markdown_text, md_path)`` tuple. ``md_path`` is the path of a
        newly created temporary ``.md`` file (UTF-8) holding the same text;
        the file is not deleted by this function. For a missing upload the
        result is ``("", None)``; for an extension other than ``.pdf`` /
        ``.docx`` it is ``("Unsupported file type.", None)``.
    """
    if uploaded_file is None:
        return "", None

    # The upload may arrive as a plain path or as a file-like wrapper whose
    # ``.name`` is the path; accept both.
    if isinstance(uploaded_file, (str, os.PathLike)):
        file_path = os.fspath(uploaded_file)
    else:
        file_path = uploaded_file.name
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # Write through the temp file's own handle so it is closed on exit;
    # delete=False keeps the file on disk so it can be served for download.
    with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", suffix=".md", delete=False
    ) as tmp:
        tmp.write(markdown_text)

    return markdown_text, tmp.name


# ---------- UI ----------
# Build the Gradio interface: an upload widget, a "Convert" button, and a
# row showing the rendered Markdown next to the downloadable .md file.
with gr.Blocks() as demo:
    # Page title and a short description of what each input format yields.
    gr.Markdown("# 📄➡️📝 Document → Markdown Converter")
    gr.Markdown(
        """
**DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)  
**PDF:** Best-effort text conversion (PDFs don’t store structure)
"""
    )

    # Input: the upload widget is restricted to the two supported extensions.
    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"]
        )

    convert_btn = gr.Button("Convert")

    # Output: rendered preview on the left, generated file on the right.
    with gr.Row():
        md_preview = gr.Markdown(label="Live Markdown Preview")
        md_download = gr.File(label="Download .md file")

    # convert_file returns (markdown_text, md_path), matching the two
    # output components in order.
    convert_btn.click(
        fn=convert_file,
        inputs=file_input,
        outputs=[md_preview, md_download]
    )

# NOTE: runs at module level (no `__main__` guard), so the app is launched
# whenever this module is executed or imported.
demo.launch()