MarkdownMaker / app.py
rbughao's picture
Update app.py
d009d38 verified
import gradio as gr
import fitz # PyMuPDF
from docx import Document
import tempfile
import os
# ---------- PDF (best-effort) ----------
def pdf_to_markdown(file_path):
    """Extract the text of a PDF as blank-line-separated paragraphs.

    This is a best-effort conversion: every non-empty line of extracted text
    becomes its own paragraph; no headings, lists or tables are detected.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The stripped, non-empty text lines of all pages, in page order,
        joined by blank lines.
    """
    lines = []
    # The context manager closes the document even if text extraction raises,
    # so the underlying file handle is not leaked.
    with fitz.open(file_path) as doc:
        for page in doc:
            stripped = (raw.strip() for raw in page.get_text("text").split("\n"))
            lines.extend(line for line in stripped if line)
    return "\n\n".join(lines)
# ---------- DOCX (structure-aware) ----------
# Markdown only defines heading levels 1-6; Word styles go up to "Heading 9".
_MAX_MD_HEADING_LEVEL = 6


def _heading_level(style_name):
    """Return the Markdown heading level (1-6) for a "Heading N" style name."""
    try:
        level = int(style_name[len("Heading"):])
    except ValueError:
        # Style names without a numeric suffix are treated as top-level.
        level = 1
    return min(max(level, 1), _MAX_MD_HEADING_LEVEL)


def _emphasize(run_text, marker):
    """Wrap run_text in marker, keeping leading/trailing whitespace outside."""
    core = run_text.strip()
    if not core:
        # Whitespace-only runs carry no visible emphasis; markers around
        # them would show up as literal asterisks.
        return run_text
    lead = run_text[: len(run_text) - len(run_text.lstrip())]
    trail = run_text[len(run_text.rstrip()):]
    return f"{lead}{marker}{core}{marker}{trail}"


def _runs_to_markdown(runs):
    """Concatenate the runs' text, marking bold and/or italic runs."""
    pieces = []
    for run in runs:
        run_text = run.text
        if not run_text:
            continue
        if run.bold and run.italic:
            run_text = _emphasize(run_text, "***")
        elif run.bold:
            run_text = _emphasize(run_text, "**")
        elif run.italic:
            run_text = _emphasize(run_text, "*")
        pieces.append(run_text)
    return "".join(pieces)


def _paragraph_to_markdown(para):
    """Convert one paragraph to a single Markdown line based on its style."""
    text = para.text.strip()
    if not text:
        return ""

    style = para.style
    # Defensive: treat a missing style or style name as "no special style".
    style_name = (style.name if style is not None else "") or ""

    if style_name.startswith("Heading"):
        return "#" * _heading_level(style_name) + " " + text
    if "List Bullet" in style_name:
        return f"- {text}"
    if "List Number" in style_name:
        return f"1. {text}"

    formatted = _runs_to_markdown(para.runs)
    # If the runs yield nothing visible although the paragraph has text
    # (presumably text held outside plain runs), keep the plain text
    # instead of emitting an empty line.
    return formatted if formatted.strip() else text


def _cell_text(cell):
    """Return a cell's text made safe for use inside a Markdown table row."""
    # An unescaped pipe would be read as a column separator and a raw line
    # break would end the table row.
    escaped = cell.text.strip().replace("|", "\\|")
    return "<br>".join(escaped.splitlines())


def _table_to_markdown(table):
    """Convert one table to Markdown lines; the first row is the header."""
    rows = [[_cell_text(cell) for cell in row.cells] for row in table.rows]
    if not rows:
        # A table without rows has no header to emit.
        return []
    header = rows[0]
    lines = [
        "",
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    lines.extend("| " + " | ".join(cells) + " |" for cells in rows[1:])
    return lines


def docx_to_markdown(file_path):
    """Convert a DOCX file to Markdown.

    Paragraph styles starting with "Heading" become ATX headings (clamped to
    levels 1-6), "List Bullet" / "List Number" styles become list items, and
    other paragraphs keep bold/italic run formatting. Tables are emitted
    after all paragraphs, each preceded by a blank line, with the first row
    used as the header row.

    NOTE: tables are not interleaved with paragraphs in document order; they
    always follow the paragraph text.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The Markdown text, one output line per paragraph or table row.
    """
    doc = Document(file_path)
    md = [_paragraph_to_markdown(para) for para in doc.paragraphs]
    for table in doc.tables:
        md.extend(_table_to_markdown(table))
    return "\n".join(md)
# ---------- Main handler ----------
def convert_file(uploaded_file):
    """Convert an uploaded PDF or DOCX file to Markdown.

    Args:
        uploaded_file: The upload to convert. May be None (nothing
            uploaded), a path (str or os.PathLike), or an object whose
            ``name`` attribute holds the path of the uploaded file.

    Returns:
        A ``(markdown_text, md_path)`` tuple. ``md_path`` is the path of a
        temporary ``.md`` file containing the same text. It is None, with
        an empty or explanatory message as the text, when nothing was
        uploaded or the file extension is not ``.pdf`` / ``.docx``.
    """
    if uploaded_file is None:
        return "", None

    if isinstance(uploaded_file, (str, os.PathLike)):
        file_path = os.fspath(uploaded_file)
    else:
        file_path = uploaded_file.name

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # Write through the temp file's own handle and close it on exit, rather
    # than leaving that handle open and reopening the file by name.
    # delete=False keeps the file on disk so its path can be returned.
    with tempfile.NamedTemporaryFile(
        "w", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(markdown_text)
    return markdown_text, tmp.name
# ---------- UI ----------
# Build the interface: an upload row with a convert button, and a result row
# showing the rendered Markdown next to a downloadable .md file.
with gr.Blocks() as demo:
    gr.Markdown("# 📄➡️📝 Document → Markdown Converter")
    gr.Markdown(
        "\n"
        "**DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)\n"
        "**PDF:** Best-effort text conversion (PDFs don’t store structure)\n"
    )

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"],
        )
        convert_btn = gr.Button("Convert")

    with gr.Row():
        md_preview = gr.Markdown(label="Live Markdown Preview")
        md_download = gr.File(label="Download .md file")

    # convert_file returns (markdown_text, md_path), matching the two outputs.
    convert_btn.click(
        fn=convert_file,
        inputs=file_input,
        outputs=[md_preview, md_download],
    )

# Only start the server when run as a script, so importing this module
# (e.g. to reuse the converters) does not launch the app.
if __name__ == "__main__":
    demo.launch()