Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
| from docx import Document | |
| import tempfile | |
| import os | |
| # ---------- PDF (best-effort) ---------- | |
def pdf_to_markdown(file_path):
    """Extract the text of a PDF as blank-line-separated Markdown paragraphs.

    This is a best-effort conversion: every non-empty line of text that
    PyMuPDF reports becomes its own paragraph. No headings, lists, or tables
    are reconstructed.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The stripped, non-empty text lines of all pages, in page order,
        joined by blank lines. An empty string if no text was found.
    """
    lines = []
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        for page in doc:
            for raw_line in page.get_text("text").split("\n"):
                line = raw_line.strip()
                if line:
                    lines.append(line)
    return "\n\n".join(lines)
| # ---------- DOCX (structure-aware) ---------- | |
def _heading_level(style_name):
    """Return the Markdown heading depth (1-6) for a "Heading N" style name."""
    try:
        level = int(style_name.replace("Heading", ""))
    except ValueError:
        # No usable number after "Heading" (e.g. a custom "Heading Foo" style).
        return 1
    # Markdown only defines heading levels 1 through 6.
    return min(max(level, 1), 6)


def _format_run(run):
    """Return a run's text wrapped in Markdown bold/italic markers."""
    text = run.text
    if not text:
        return ""
    if run.bold and run.italic:
        marker = "***"
    elif run.bold:
        marker = "**"
    elif run.italic:
        marker = "*"
    else:
        return text
    core = text.strip()
    if not core:
        # Whitespace-only run: emphasis markers would show up literally.
        return text
    # Emphasis markers must hug the text ("**bold** " not "**bold **"), so
    # surrounding whitespace is moved outside the markers.
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    return f"{leading}{marker}{core}{marker}{trailing}"


def _escape_cell(cell_text):
    """Return cell text that is safe to place inside a Markdown table row."""
    # A literal pipe would start a new column and a newline would end the row.
    return cell_text.strip().replace("|", "\\|").replace("\n", " ")


def _table_row(cells):
    """Return one Markdown table line for the given cell strings."""
    return "| " + " | ".join(cells) + " |"


def _table_to_markdown(table):
    """Return the Markdown lines for one table, or [] if it has no rows."""
    rows = [[_escape_cell(cell.text) for cell in row.cells] for row in table.rows]
    if not rows:
        return []
    header, *body = rows
    # Leading "" yields a blank line that separates the table from prior text.
    lines = ["", _table_row(header), _table_row(["---"] * len(header))]
    lines.extend(_table_row(row) for row in body)
    return lines


def docx_to_markdown(file_path):
    """Convert a DOCX file to Markdown using its paragraph styles.

    Paragraphs are emitted in document order, one output line each:
    "Heading N" styles become "#" headings, "List Bullet" styles become
    "- " items, "List Number" styles become "1. " items, and every other
    paragraph is rebuilt from its runs with bold/italic markers. Empty
    paragraphs become empty lines. All tables are appended after the
    paragraphs (not at their original position), with the first row of
    each table used as the header row.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The Markdown text, lines joined with a single newline.
    """
    doc = Document(file_path)
    md = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            md.append("")
            continue

        style_name = para.style.name or ""
        if style_name.startswith("Heading"):
            md.append("#" * _heading_level(style_name) + " " + text)
        elif "List Bullet" in style_name:
            md.append(f"- {text}")
        elif "List Number" in style_name:
            # Markdown renderers renumber ordered lists, so "1." is enough.
            md.append(f"1. {text}")
        else:
            formatted = "".join(_format_run(run) for run in para.runs)
            # NOTE(review): para.runs may not cover all of para.text (e.g.
            # text held only inside hyperlinks - confirm for the python-docx
            # version in use); fall back to the plain text so it is not lost.
            md.append(formatted if formatted.strip() else text)

    for table in doc.tables:
        md.extend(_table_to_markdown(table))

    return "\n".join(md)
| # ---------- Main handler ---------- | |
def convert_file(uploaded_file):
    """Convert an uploaded PDF or DOCX file to Markdown.

    Args:
        uploaded_file: The uploaded file, either as a path (``str`` or
            ``os.PathLike``) or as an object whose ``name`` attribute is the
            path. ``None`` means nothing was uploaded.

    Returns:
        A ``(markdown_text, md_path)`` tuple. ``md_path`` is the path of a
        newly written ``.md`` temporary file containing ``markdown_text``.
        Returns ``("", None)`` when nothing was uploaded and
        ``("Unsupported file type.", None)`` for any extension other than
        ``.docx`` or ``.pdf``.
    """
    if uploaded_file is None:
        return "", None

    # Accept both plain paths and file-like wrappers that expose ``.name``.
    if isinstance(uploaded_file, (str, os.PathLike)):
        file_path = os.fspath(uploaded_file)
    else:
        file_path = uploaded_file.name

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # delete=False keeps the file on disk after the handle is closed so the
    # caller can serve it; the ``with`` block closes the handle instead of
    # leaking it.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".md", encoding="utf-8", delete=False
    ) as tmp:
        tmp.write(markdown_text)
    return markdown_text, tmp.name
| # ---------- UI ---------- | |
# Build the interface at import time so `demo` is available to other modules
# and tooling, but only start the server when run as a script.
with gr.Blocks() as demo:
    gr.Markdown("# 📄➡️📝 Document → Markdown Converter")
    # The blank line between the two notes makes them render as separate
    # paragraphs instead of running together on one line.
    gr.Markdown(
        """
        **DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)

        **PDF:** Best-effort text conversion (PDFs don’t store structure)
        """
    )

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"],
        )

    convert_btn = gr.Button("Convert")

    with gr.Row():
        md_preview = gr.Markdown(label="Live Markdown Preview")
        md_download = gr.File(label="Download .md file")

    # convert_file returns (markdown_text, md_path), matching the two outputs.
    convert_btn.click(
        fn=convert_file,
        inputs=file_input,
        outputs=[md_preview, md_download],
    )

if __name__ == "__main__":
    demo.launch()