Spaces:

rbughao
/

MarkdownMaker

Sleeping

App Files Files Community

rbughao committed on Jan 26

Commit

1a0b2db

verified ·

1 Parent(s): ddcea1e

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -76

app.py CHANGED Viewed

@@ -1,102 +1,147 @@
-import io
-import os
-import datetime
 import gradio as gr
 from markitdown import MarkItDown
-md = MarkItDown()
-def convert_file(file, output_format):
-    if file is None:
-        return gr.update(value="Please upload a file."), None
     try:
-        result = md.convert(file.name)
-        text = getattr(result, "text_content", None)
-        if not text:
-            text = getattr(result, "markdown_content", "")
-        if not text:
-            text = "No textual content extracted."
-        base = os.path.splitext(os.path.basename(file.name))[0]
-        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        if output_format == "markdown":
-            out_name = f"{base}_extracted_{timestamp}.md"
-        else:
-            out_name = f"{base}_extracted_{timestamp}.txt"
-        bytes_io = io.BytesIO(text.encode("utf-8"))
-        bytes_io.seek(0)
-        return text, (out_name, bytes_io)
-    except Exception as e:
-        return f"❌ Conversion failed: {e}", None
-copy_js = """
-() => {
-  const tb = document.querySelector('textarea');
-  if (!tb) { alert('Nothing to copy'); return; }
-  tb.select();
-  document.execCommand('copy');
-  alert('Copied to clipboard');
-}
-"""
-with gr.Blocks(title="MarkItDown - Document Extractor") as demo:
-    gr.Markdown(
-        """
-        # 📝 MarkItDown – Document Text Extractor
-        Upload a **PDF, DOCX, PPTX, EML, HTML**, or similar file and extract clean text using https://github.com/microsoft/markitdown.
-        """
     )
     with gr.Row():
-        file_input = gr.File(
-            label="Upload a document",
-            file_count="single",
-            type="filepath",
-            file_types=[".pdf", ".docx", ".pptx", ".html", ".htm", ".eml", ".txt", ".md", ".rtf"],
         )
-        output_format = gr.Radio(
-            choices=["markdown", "text"],
-            value="markdown",
-            label="Download format",
         )
-    with gr.Row():
-        convert_btn = gr.Button("Convert", variant="primary")
-        clear_btn = gr.Button("Clear")
-        copy_btn = gr.Button("Copy Text")
-    text_output = gr.Textbox(
-        label="Extracted Text",
-        lines=20
-    )
-    download_file = gr.File(
-        label="Download Extracted File",
-        interactive=False
-    )
     convert_btn.click(
-        fn=convert_file,
-        inputs=[file_input, output_format],
-        outputs=[text_output, download_file],
-        api_name="convert"
-    )
-    clear_btn.click(
-        fn=lambda: (None, "", None),
-        inputs=[],
-        outputs=[file_input, text_output, download_file]
     )
-    # Client-side copy to clipboard
-    copy_btn.click(None, [], [], js=copy_js)
 if __name__ == "__main__":
-    demo.launch()

+# app.py
 import gradio as gr
 from markitdown import MarkItDown
+from pathlib import Path
+import tempfile
+import os
+# Increase max file size if you expect very large documents
+MAX_FILE_SIZE_MB = 50
+def convert_to_markdown(file_obj):
+    if file_obj is None:
+        return "Please upload a document.", ""
     try:
+        # Get original filename and extension
+        original_name = Path(file_obj.name).stem
+        ext = Path(file_obj.name).suffix.lower()
+        allowed_extensions = [
+            '.pdf', '.doc', '.docx', '.ppt', '.pptx',
+            '.xls', '.xlsx', '.odt', '.rtf', '.txt', '.md'
+        ]
+        if ext not in allowed_extensions:
+            return (
+                f"Unsupported file format: {ext}\n\n"
+                f"Supported formats: {', '.join(allowed_extensions)}",
+                ""
+            )
+        # Create temporary file (markitdown expects a real file path)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
+            tmp.write(file_obj.read())
+            tmp_path = tmp.name
+        try:
+            md = MarkItDown()
+            result = md.convert(tmp_path)
+            markdown_text = result.text_content
+            # Optional: better default filename suggestion
+            suggested_filename = f"{original_name}.md"
+            return (
+                f"Conversion successful! ✓\nFile: {original_name}{ext}",
+                markdown_text,
+                suggested_filename
+            )
+        finally:
+            # Clean up temporary file
+            try:
+                os.unlink(tmp_path)
+            except:
+                pass
+    except Exception as e:
+        import traceback
+        error_msg = traceback.format_exc()
+        return f"Error during conversion:\n{str(e)}\n\n{error_msg[:800]}…", "", ""
+# ────────────────────────────────────────────────
+#                Gradio Interface
+# ────────────────────────────────────────────────
+css = """
+.upload-box {border: 1px solid #ccc; border-radius: 8px; padding: 16px; background: #fafafa;}
+.success {color: #2e7d32;}
+.error   {color: #c62828;}
+"""
+with gr.Blocks(title="Document → Markdown Converter", css=css) as demo:
+    gr.Markdown("""
+    # Document → Markdown Converter
+    Upload PDF, Word, PowerPoint, Excel, ... → get clean Markdown
+    Powered by **markitdown** • Works best with text-heavy documents
+    """)
+    with gr.Row():
+        with gr.Column(scale=4):
+            file_input = gr.File(
+                label="Upload document",
+                file_count="single",
+                file_types=[
+                    ".pdf", ".doc", ".docx", ".ppt", ".pptx",
+                    ".xls", ".xlsx", ".odt", ".rtf", ".txt", ".md"
+                ],
+                elem_classes="upload-box",
+                max_size=f"{MAX_FILE_SIZE_MB}MB"
+            )
+        with gr.Column(scale=1, min_width=180):
+            convert_btn = gr.Button("Convert to Markdown", variant="primary", scale=1)
+    status_output = gr.Textbox(
+        label="Status",
+        lines=3,
+        interactive=False,
+        show_copy_button=False
+    )
+    markdown_output = gr.Markdown(
+        label="Converted Markdown",
+        height=500
     )
     with gr.Row():
+        download_file = gr.File(
+            label="Download markdown file",
+            file_types=[".md"],
+            interactive=False
         )
+        download_name = gr.Textbox(
+            label="Suggested filename",
+            value="document.md",
+            interactive=True,
+            max_lines=1
         )
+    # ─── Main action ───────────────────────────────────────
     convert_btn.click(
+        fn=convert_to_markdown,
+        inputs=file_input,
+        outputs=[status_output, markdown_output, download_name]
+    ).then(
+        fn=lambda md_text, fname: gr.File(value=md_text, filename=fname) if md_text else None,
+        inputs=[markdown_output, download_name],
+        outputs=download_file
     )
+    gr.Markdown("""
+    ### Notes
+    - Maximum file size: ~{MAX_FILE_SIZE_MB} MB (Hugging Face free tier limit is usually higher)
+    - Best results with text-oriented documents
+    - Tables, images, complex layouts may be simplified
+    - Very large documents may take longer or timeout
+    """)
 if __name__ == "__main__":
+    demo.launch()