Spaces:

rbughao
/

MarkdownMaker

Sleeping

App Files Files Community

MarkdownMaker / app.py

rbughao

Update app.py

d009d38 verified 18 days ago

raw

history blame contribute delete

3.49 kB

	import gradio as gr
	import fitz # PyMuPDF
	from docx import Document
	import tempfile
	import os


	# ---------- PDF (best-effort) ----------
	def pdf_to_markdown(file_path):
	    """Convert a PDF to plain Markdown text on a best-effort basis.

	    The text of every page is split on newlines; each non-blank line is
	    stripped and emitted as its own paragraph. No headings, lists, or other
	    structure are reconstructed.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The stripped, non-empty text lines of all pages, in page order,
	        joined with blank lines ("\\n\\n").
	    """
	    doc = fitz.open(file_path)
	    try:
	        stripped_lines = (
	            raw_line.strip()
	            for page in doc
	            for raw_line in page.get_text("text").split("\n")
	        )
	        # Materialise the result before the document is closed below.
	        lines = [line for line in stripped_lines if line]
	    finally:
	        # Release the underlying file even if text extraction fails.
	        doc.close()

	    return "\n\n".join(lines)


	# ---------- DOCX (structure-aware) ----------
	def _heading_level(style_name):
	    """Return the Markdown heading depth (1-6) for a "Heading N" style name."""
	    try:
	        level = int(style_name.replace("Heading", ""))
	    except ValueError:
	        # No usable numeric suffix: treat it as a top-level heading.
	        return 1
	    # Markdown defines only six heading levels; keep the depth inside 1..6.
	    return max(1, min(level, 6))


	def _emphasize(text, bold, italic):
	    """Wrap *text* in the Markdown emphasis markers for bold and/or italic."""
	    marker = ("**" if bold else "") + ("*" if italic else "")
	    core = text.strip()
	    if not marker or not core:
	        return text
	    # Emphasis markers must touch the text they wrap, so any surrounding
	    # whitespace is kept outside of them.
	    lead = text[: len(text) - len(text.lstrip())]
	    trail = text[len(text.rstrip()):]
	    return f"{lead}{marker}{core}{marker}{trail}"


	def _inline_markdown(runs):
	    """Join the text of *runs*, marking bold and italic spans."""
	    from itertools import groupby

	    # Runs without text contribute nothing; dropping them first lets
	    # neighbouring runs with identical formatting merge into one span
	    # instead of producing back-to-back markers such as "**a****b**".
	    styled = (
	        (bool(run.bold), bool(run.italic), run.text)
	        for run in runs
	        if run.text
	    )
	    return "".join(
	        _emphasize("".join(part[2] for part in group), bold, italic)
	        for (bold, italic), group in groupby(styled, key=lambda part: part[:2])
	    )


	def _table_rows(table):
	    """Return the Markdown lines for *table*, using its first row as header."""
	    rows = [
	        [
	            # A raw newline would end the table row and a bare pipe would
	            # start a new column, so neutralise both inside cell text.
	            cell.text.strip().replace("\n", " ").replace("|", "\\|")
	            for cell in row.cells
	        ]
	        for row in table.rows
	    ]
	    if not rows:
	        # A table without rows has no header to emit.
	        return []

	    header = rows[0]
	    separator = ["---"] * len(header)
	    return [
	        "| " + " | ".join(cells) + " |"
	        for cells in [header, separator] + rows[1:]
	    ]


	def docx_to_markdown(file_path):
	    """Convert a DOCX file to Markdown using paragraph styles and run formatting.

	    Paragraph handling, in order of precedence:
	      * blank paragraphs become empty lines;
	      * styles whose name starts with "Heading" become ``#`` headings
	        (depth taken from the style name, limited to 1-6);
	      * styles containing "List Bullet" become ``- `` items;
	      * styles containing "List Number" become ``1. `` items;
	      * anything else is rendered run by run with bold/italic markers.

	    Tables are appended after all paragraphs (not at their position in the
	    document), each preceded by a blank line, with the first row used as
	    the header row.

	    Args:
	        file_path: Path of the DOCX file to read.

	    Returns:
	        The generated Markdown lines joined with single newlines.
	    """
	    doc = Document(file_path)
	    md = []

	    for para in doc.paragraphs:
	        text = para.text.strip()
	        if not text:
	            md.append("")
	            continue

	        style_name = para.style.name or ""
	        if style_name.startswith("Heading"):
	            md.append("#" * _heading_level(style_name) + " " + text)
	        elif "List Bullet" in style_name:
	            md.append(f"- {text}")
	        elif "List Number" in style_name:
	            md.append(f"1. {text}")
	        else:
	            # Fall back to the plain paragraph text if none of the runs
	            # carries any text, so a non-blank paragraph is never dropped.
	            md.append(_inline_markdown(para.runs) or text)

	    for table in doc.tables:
	        table_lines = _table_rows(table)
	        if table_lines:
	            md.append("")
	            md.extend(table_lines)

	    return "\n".join(md)


	# ---------- Main handler ----------
	def convert_file(uploaded_file):
	    """Convert an uploaded PDF or DOCX file to Markdown.

	    Args:
	        uploaded_file: The upload to convert. Either a filesystem path
	            (``str`` or ``os.PathLike``) or an object whose ``name``
	            attribute holds the path. ``None`` means nothing was uploaded.

	    Returns:
	        A ``(markdown_text, md_path)`` tuple, where ``md_path`` is the path
	        of a temporary ``.md`` file containing ``markdown_text``.
	        When nothing was uploaded the result is ``("", None)``; for any
	        extension other than ``.pdf`` / ``.docx`` (case-insensitive) it is
	        ``("Unsupported file type.", None)``.
	    """
	    if uploaded_file is None:
	        return "", None

	    # Accept plain paths as well as upload objects that expose ``.name``.
	    if isinstance(uploaded_file, (str, os.PathLike)):
	        file_path = os.fspath(uploaded_file)
	    else:
	        file_path = uploaded_file.name
	    ext = os.path.splitext(file_path)[1].lower()

	    if ext == ".docx":
	        markdown_text = docx_to_markdown(file_path)
	    elif ext == ".pdf":
	        markdown_text = pdf_to_markdown(file_path)
	    else:
	        return "Unsupported file type.", None

	    # Write through the temporary file's own handle so it is closed on exit
	    # instead of being leaked and reopened by name. ``delete=False`` keeps
	    # the file on disk because its path is handed back to the caller.
	    with tempfile.NamedTemporaryFile(
	        mode="w", delete=False, suffix=".md", encoding="utf-8"
	    ) as tmp:
	        tmp.write(markdown_text)

	    return markdown_text, tmp.name


	# ---------- UI ----------
	# Build the page: an upload box, a convert button, and a row holding the
	# rendered Markdown preview next to the downloadable .md file.
	with gr.Blocks() as demo:
	    gr.Markdown("# 📄➡️📝 Document → Markdown Converter")
	    # One bullet per format: a single newline inside a Markdown paragraph
	    # is only a soft break, so without list markers both notes would run
	    # together into one paragraph.
	    gr.Markdown(
	        "- DOCX: High-quality Markdown (headings, lists, bold, italics, tables)\n"
	        "- PDF: Best-effort text conversion (PDFs don’t store structure)"
	    )

	    with gr.Row():
	        file_input = gr.File(
	            label="Upload PDF or DOCX",
	            file_types=[".pdf", ".docx"],
	        )

	    convert_btn = gr.Button("Convert")

	    with gr.Row():
	        md_preview = gr.Markdown(label="Live Markdown Preview")
	        md_download = gr.File(label="Download .md file")

	    # convert_file returns (markdown_text, md_path), matching the two outputs.
	    convert_btn.click(
	        fn=convert_file,
	        inputs=file_input,
	        outputs=[md_preview, md_download],
	    )

	demo.launch()