Spaces:

rbughao
/

MarkdownMaker

Sleeping

App Files Files Community

rbughao committed on Jan 26

Commit

d009d38

verified ·

1 Parent(s): 4946c48

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -29

app.py CHANGED Viewed

@@ -1,56 +1,136 @@
 import gradio as gr
 import fitz  # PyMuPDF
-import docx
-from markdownify import markdownify as md
-import os
 import tempfile
-def extract_text_from_pdf(file_path):
-    text = ""
     doc = fitz.open(file_path)
     for page in doc:
-        text += page.get_text()
-    return text
-def extract_text_from_docx(file_path):
-    doc = docx.Document(file_path)
-    return "\n".join([p.text for p in doc.paragraphs])
-def convert_to_markdown(uploaded_file):
     if uploaded_file is None:
-        return None
     file_path = uploaded_file.name
     ext = os.path.splitext(file_path)[1].lower()
-    if ext == ".pdf":
-        text = extract_text_from_pdf(file_path)
-    elif ext == ".docx":
-        text = extract_text_from_docx(file_path)
     else:
-        return None
-    markdown_text = md(text)
-    temp_md = tempfile.NamedTemporaryFile(delete=False, suffix=".md")
-    with open(temp_md.name, "w", encoding="utf-8") as f:
         f.write(markdown_text)
-    return temp_md.name
 with gr.Blocks() as demo:
-    gr.Markdown("# 📄➡️📝 Document to Markdown Converter")
-    gr.Markdown("Upload a **PDF or Word (.docx)** file and get a **Markdown (.md)** file.")
-    file_input = gr.File(label="Upload PDF or DOCX", file_types=[".pdf", ".docx"])
-    output_file = gr.File(label="Download Markdown File")
-    convert_btn = gr.Button("Convert to Markdown")
     convert_btn.click(
-        fn=convert_to_markdown,
         inputs=file_input,
-        outputs=output_file
     )
 demo.launch()

 import gradio as gr
 import fitz  # PyMuPDF
+from docx import Document
 import tempfile
+import os
+# ---------- PDF (best-effort) ----------
def pdf_to_markdown(file_path):
    """Convert a PDF into best-effort Markdown.

    Plain text is extracted page by page. Every non-blank line, stripped of
    surrounding whitespace, is emitted as its own paragraph (lines are joined
    with a blank line between them).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text as one string; empty when no page yields text.
    """
    lines = []
    # Context manager guarantees the document is closed, even when text
    # extraction raises part-way through.
    with fitz.open(file_path) as doc:
        for page in doc:
            page_text = page.get_text("text")
            lines.extend(
                stripped
                for stripped in map(str.strip, page_text.split("\n"))
                if stripped
            )
    return "\n\n".join(lines)
+# ---------- DOCX (structure-aware) ----------
_MAX_MARKDOWN_HEADING_LEVEL = 6


def _heading_level(style_name):
    """Return the Markdown heading depth (1-6) for a "Heading N" style name."""
    try:
        level = int(style_name.replace("Heading", ""))
    except ValueError:
        # Heading-like style without a usable numeric suffix.
        return 1
    # Markdown only defines six heading levels; also guards against "#" * 0.
    return min(max(level, 1), _MAX_MARKDOWN_HEADING_LEVEL)


def _format_run(run):
    """Return the run's text wrapped in Markdown bold/italic markers."""
    text = run.text
    if not text:
        return ""
    if run.bold and run.italic:
        marker = "***"
    elif run.bold:
        marker = "**"
    elif run.italic:
        marker = "*"
    else:
        return text
    core = text.strip()
    if not core:
        # Whitespace-only run: there is nothing to emphasise.
        return text
    # Keep surrounding whitespace outside the markers; "**bold **" is not
    # recognised as emphasis by Markdown renderers.
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    return f"{leading}{marker}{core}{marker}{trailing}"


def _table_row(cells):
    """Render one table row as a Markdown pipe-delimited line."""
    # A literal "|" or a line break inside a cell would otherwise break the
    # row structure of the Markdown table.
    texts = [
        cell.text.strip().replace("|", "\\|").replace("\n", " ")
        for cell in cells
    ]
    return "| " + " | ".join(texts) + " |"


def _table_to_markdown(table):
    """Return the Markdown lines for a table, or [] when it has no rows."""
    rows = list(table.rows)
    if not rows:
        return []
    header_cells = rows[0].cells
    separator = "| " + " | ".join(["---"] * len(header_cells)) + " |"
    # Leading empty string puts a blank line before the table.
    lines = ["", _table_row(header_cells), separator]
    lines.extend(_table_row(row.cells) for row in rows[1:])
    return lines


def docx_to_markdown(file_path):
    """Convert a .docx file to Markdown using paragraph styles.

    Paragraphs whose style name starts with "Heading" become ATX headings,
    "List Bullet" / "List Number" styles become list items, and all other
    paragraphs are rebuilt run by run so bold/italic runs keep their
    emphasis. Headings and list items use the paragraph's plain text, so
    inline emphasis inside them is not preserved. Blank paragraphs become
    blank lines. The first row of each table is used as its header.

    Note: tables are appended after all paragraphs, not at their original
    position in the document.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The Markdown text, one output line per paragraph / table row.
    """
    doc = Document(file_path)
    md = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            md.append("")
            continue

        style_name = para.style.name
        if style_name.startswith("Heading"):
            md.append("#" * _heading_level(style_name) + " " + text)
        elif "List Bullet" in style_name:
            md.append(f"- {text}")
        elif "List Number" in style_name:
            # "1." on every item: Markdown renderers renumber ordered lists.
            md.append(f"1. {text}")
        else:
            md.append("".join(_format_run(run) for run in para.runs))

    for table in doc.tables:
        md.extend(_table_to_markdown(table))

    return "\n".join(md)
+# ---------- Main handler ----------
def convert_file(uploaded_file):
    """Convert an uploaded PDF/DOCX file to Markdown.

    Args:
        uploaded_file: The upload to convert: either an object exposing the
            file path as ``.name`` or a plain path string. ``None`` means
            nothing was uploaded.

    Returns:
        A ``(markdown_text, md_path)`` tuple. ``md_path`` is the path of a
        temporary ``.md`` file containing the same text. When there is no
        upload the result is ``("", None)``; for an unsupported extension it
        is ``("Unsupported file type.", None)``.
    """
    if uploaded_file is None:
        return "", None

    # Fall back to the value itself when it has no ``.name`` (plain path).
    file_path = getattr(uploaded_file, "name", uploaded_file)
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # Write through the temp-file handle itself so it is closed on exit,
    # instead of leaking it and re-opening the file by name.
    # delete=False keeps the file on disk so it can be served for download.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".md", delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(markdown_text)

    return markdown_text, tmp.name
+# ---------- UI ----------
 with gr.Blocks() as demo:
+    gr.Markdown("# 📄➡️📝 Document → Markdown Converter")
+    gr.Markdown(
+        """
+**DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)
+**PDF:** Best-effort text conversion (PDFs don’t store structure)
+"""
+    )
+    with gr.Row():
+        file_input = gr.File(
+            label="Upload PDF or DOCX",
+            file_types=[".pdf", ".docx"]
+        )
+    convert_btn = gr.Button("Convert")
+    with gr.Row():
+        md_preview = gr.Markdown(label="Live Markdown Preview")
+        md_download = gr.File(label="Download .md file")
     convert_btn.click(
+        fn=convert_file,
         inputs=file_input,
+        outputs=[md_preview, md_download]
     )
 demo.launch()