"""Gradio app that converts an uploaded DOCX or PDF file to Markdown.

DOCX files are converted with awareness of headings, list styles, bold and
italic runs, and tables.  PDF files get a best-effort plain-text extraction.
"""

import os
import tempfile

import fitz  # PyMuPDF
import gradio as gr
from docx import Document

# Markdown only defines heading levels 1-6; Word defines "Heading 1" to "Heading 9".
_MAX_MD_HEADING_LEVEL = 6


# ---------- PDF (best-effort) ----------
def pdf_to_markdown(file_path):
    """Extract the text of a PDF as Markdown paragraphs.

    Every non-blank line of every page becomes its own paragraph (lines are
    joined with a blank line between them).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text as a single string.
    """
    lines = []
    # Use the document as a context manager so the file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        for page in doc:
            for raw_line in page.get_text("text").split("\n"):
                stripped = raw_line.strip()
                if stripped:
                    lines.append(stripped)
    return "\n\n".join(lines)


# ---------- DOCX (structure-aware) ----------
def _heading_level(style_name):
    """Return the Markdown heading level for a "Heading N" style name.

    Falls back to level 1 when the suffix is not a number, and clamps the
    result to the range Markdown supports.
    """
    try:
        level = int(style_name.replace("Heading", ""))
    except ValueError:
        level = 1
    return max(1, min(level, _MAX_MD_HEADING_LEVEL))


def _wrap_emphasis(text, marker):
    """Wrap *text* in *marker*, keeping surrounding whitespace outside it.

    Markdown does not treat ``**bold **`` as emphasis, so leading and
    trailing whitespace is moved outside the markers.  Whitespace-only text
    is returned unchanged.
    """
    core = text.strip()
    if not core:
        return text
    lead = text[: len(text) - len(text.lstrip())]
    trail = text[len(text.rstrip()):]
    return f"{lead}{marker}{core}{marker}{trail}"


def _format_run(run):
    """Return a run's text with Markdown bold/italic markers applied."""
    text = run.text
    if not text:
        return ""
    if run.bold and run.italic:
        return _wrap_emphasis(text, "***")
    if run.bold:
        return _wrap_emphasis(text, "**")
    if run.italic:
        return _wrap_emphasis(text, "*")
    return text


def _table_cell_text(cell):
    """Return a cell's text made safe for use inside a Markdown table row."""
    text = cell.text.strip()
    # A literal pipe or newline would end the cell/row early.
    return text.replace("|", "\\|").replace("\n", "<br>")


def _table_row(cells):
    """Format a list of cell strings as one Markdown table row."""
    return "| " + " | ".join(cells) + " |"


def docx_to_markdown(file_path):
    """Convert a DOCX file to Markdown.

    Paragraphs are emitted first, in document order: "Heading*" styles become
    ``#`` headings, "List Bullet" / "List Number" styles become list items,
    and all other paragraphs keep bold/italic run formatting.  Empty
    paragraphs become blank lines.  All tables are appended after the
    paragraphs (their position relative to the paragraphs is not preserved);
    the first row of each table is used as the header row.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The Markdown text as a single string.
    """
    doc = Document(file_path)
    md = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            md.append("")
            continue

        style_name = para.style.name

        # Headings
        if style_name.startswith("Heading"):
            md.append("#" * _heading_level(style_name) + " " + text)
            continue

        # Bullet lists
        if "List Bullet" in style_name:
            md.append(f"- {text}")
            continue

        # Numbered lists ("1." everywhere; Markdown renderers renumber)
        if "List Number" in style_name:
            md.append(f"1. {text}")
            continue

        # Inline formatting
        md.append("".join(_format_run(run) for run in para.runs))

    # Tables
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            # Nothing to render; indexing the first row would fail.
            continue
        md.append("")
        headers = [_table_cell_text(cell) for cell in rows[0].cells]
        md.append(_table_row(headers))
        md.append(_table_row(["---"] * len(headers)))
        for row in rows[1:]:
            md.append(_table_row([_table_cell_text(cell) for cell in row.cells]))

    return "\n".join(md)


# ---------- Main handler ----------
def convert_file(uploaded_file):
    """Convert an uploaded file and write the result to a ``.md`` temp file.

    Args:
        uploaded_file: The value delivered by the ``gr.File`` input: either a
            filesystem path string or an object whose ``name`` attribute is
            the path.  ``None`` means nothing was uploaded.

    Returns:
        A ``(markdown_text, md_file_path)`` tuple.  When nothing was uploaded
        the result is ``("", None)``; for an unsupported extension it is
        ``("Unsupported file type.", None)``.
    """
    if uploaded_file is None:
        return "", None

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    if isinstance(uploaded_file, str):
        file_path = uploaded_file
    else:
        file_path = uploaded_file.name
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # Write through the temp-file handle itself so it is closed on exit
    # (delete=False keeps the file on disk for the download component).
    with tempfile.NamedTemporaryFile(
        "w", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(markdown_text)

    return markdown_text, tmp.name


# ---------- UI ----------
with gr.Blocks() as demo:
    gr.Markdown("# 📄➡️📝 Document → Markdown Converter")
    gr.Markdown(
        "**DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)\n\n"
        "**PDF:** Best-effort text conversion (PDFs don’t store structure)"
    )

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"],
        )

    convert_btn = gr.Button("Convert")

    with gr.Row():
        md_preview = gr.Markdown(label="Live Markdown Preview")
        md_download = gr.File(label="Download .md file")

    convert_btn.click(
        fn=convert_file,
        inputs=file_input,
        outputs=[md_preview, md_download],
    )

# Only start the server when run as a script, so the module can be imported.
if __name__ == "__main__":
    demo.launch()