# Page-scrape residue (not program text): "Spaces: Sleeping", "File size: 3,486 Bytes".
import gradio as gr
import fitz # PyMuPDF
from docx import Document
import tempfile
import os
# ---------- PDF (best-effort) ----------
def pdf_to_markdown(file_path):
    """Extract the text of a PDF as Markdown paragraphs (best effort).

    No structure (headings, lists, tables) is recovered: every non-blank
    line of extracted text simply becomes its own paragraph.

    Args:
        file_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        The stripped, non-empty text lines of all pages in page order,
        separated by blank lines. An empty string if no text was found.
    """
    lines = []
    # The context manager closes the document (and its underlying file)
    # even when extraction raises; previously it was left open.
    with fitz.open(file_path) as doc:
        for page in doc:
            stripped = (raw.strip() for raw in page.get_text("text").split("\n"))
            lines.extend(line for line in stripped if line)
    return "\n\n".join(lines)
# ---------- DOCX (structure-aware) ----------
def _heading_level(style_name):
    """Return the Markdown heading depth (1-6) for a "Heading N" style name."""
    try:
        level = int(style_name.replace("Heading", ""))
    except ValueError:
        # No parseable number after "Heading": fall back to a top-level heading.
        return 1
    # Markdown only has six heading levels; out-of-range numbers would
    # otherwise produce text that does not render as a heading.
    return max(1, min(level, 6))


def _emphasize(text, marker):
    """Wrap *text* in *marker*, keeping leading/trailing whitespace outside."""
    core = text.strip()
    if not core:
        # Whitespace-only run: nothing to emphasize.
        return text
    start = text.index(core)
    end = start + len(core)
    # "**bold **" is not rendered as bold; "**bold** " is.
    return text[:start] + marker + core + marker + text[end:]


def _runs_to_markdown(runs):
    """Concatenate paragraph runs, marking bold and/or italic runs."""
    parts = []
    for run in runs:
        run_text = run.text
        if not run_text:
            continue
        if run.bold and run.italic:
            run_text = _emphasize(run_text, "***")
        elif run.bold:
            run_text = _emphasize(run_text, "**")
        elif run.italic:
            run_text = _emphasize(run_text, "*")
        parts.append(run_text)
    return "".join(parts)


def _table_row(cells):
    """Format a list of cell strings as one Markdown table row."""
    return "| " + " | ".join(cells) + " |"


def _table_to_markdown(table):
    """Return the Markdown lines for *table*, or [] if it has no cells."""
    rows = [
        [
            # A literal "|" or a newline inside a cell would break the row.
            cell.text.strip().replace("|", "\\|").replace("\n", " ")
            for cell in row.cells
        ]
        for row in table.rows
    ]
    if not rows or not rows[0]:
        return []
    header = rows[0]
    lines = [_table_row(header), _table_row(["---"] * len(header))]
    lines.extend(_table_row(cells) for cells in rows[1:])
    return lines


def docx_to_markdown(file_path):
    """Convert a DOCX file to Markdown using paragraph styles and run formatting.

    Paragraphs are mapped by style name: names starting with "Heading"
    become ``#`` headings, names containing "List Bullet" become ``- `` items,
    names containing "List Number" become ``1. `` items, and everything else
    is emitted with bold/italic runs marked inline. Empty paragraphs become
    blank lines. All tables are appended after the paragraphs, each using
    its first row as the header row.

    Args:
        file_path: Path of the DOCX file, passed straight to ``Document``.

    Returns:
        The generated Markdown lines joined with single newlines.
    """
    doc = Document(file_path)
    md = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            md.append("")
            continue

        # Tolerate a missing style or an unnamed style instead of raising.
        style_name = getattr(para.style, "name", None) or ""

        if style_name.startswith("Heading"):
            md.append("#" * _heading_level(style_name) + " " + text)
        elif "List Bullet" in style_name:
            md.append(f"- {text}")
        elif "List Number" in style_name:
            md.append(f"1. {text}")
        else:
            md.append(_runs_to_markdown(para.runs))

    for table in doc.tables:
        table_lines = _table_to_markdown(table)
        if not table_lines:
            # A table without rows/cells has no header to render.
            continue
        md.append("")
        md.extend(table_lines)

    return "\n".join(md)
# ---------- Main handler ----------
def convert_file(uploaded_file):
    """Convert an uploaded PDF or DOCX to Markdown and save it as a .md file.

    Args:
        uploaded_file: ``None`` when nothing was uploaded, a filesystem path
            (``str`` or ``os.PathLike``), or an object whose ``name``
            attribute holds the path.

    Returns:
        A ``(markdown_text, md_path)`` tuple. ``("", None)`` when no file was
        given and ``("Unsupported file type.", None)`` when the extension is
        neither ``.docx`` nor ``.pdf``; otherwise the Markdown text and the
        path of a temporary ``.md`` file containing it (the file is not
        deleted by this function).
    """
    if uploaded_file is None:
        return "", None

    # Accept a bare path as well as a wrapper object exposing `.name`, so the
    # handler works whichever form the upload component passes in.
    if isinstance(uploaded_file, (str, os.PathLike)):
        file_path = os.fspath(uploaded_file)
    else:
        file_path = uploaded_file.name

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # Write through the temp file's own handle and close it on exit; the
    # original left that handle open and reopened the file by name.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".md", encoding="utf-8", delete=False
    ) as tmp:
        tmp.write(markdown_text)

    return markdown_text, tmp.name
# ---------- UI ----------
# Static Markdown shown at the top of the page.
_TITLE_MD = "# 📄➡️📝 Document → Markdown Converter"
_INTRO_MD = """
**DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)
**PDF:** Best-effort text conversion (PDFs don’t store structure)
"""

demo = gr.Blocks()
with demo:
    gr.Markdown(_TITLE_MD)
    gr.Markdown(_INTRO_MD)

    # Input row: the upload widget next to the button that triggers conversion.
    with gr.Row():
        file_input = gr.File(
            file_types=[".pdf", ".docx"],
            label="Upload PDF or DOCX",
        )
        convert_btn = gr.Button("Convert")

    # Output row: rendered Markdown next to the downloadable .md file.
    with gr.Row():
        md_preview = gr.Markdown(label="Live Markdown Preview")
        md_download = gr.File(label="Download .md file")

    # convert_file returns (markdown_text, md_path), matching the two outputs.
    convert_btn.click(
        convert_file,
        inputs=file_input,
        outputs=[md_preview, md_download],
    )

demo.launch()
|