rbughao committed on
Commit
1a0b2db
·
verified ·
1 Parent(s): ddcea1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -76
app.py CHANGED
@@ -1,102 +1,147 @@
 
1
 
2
- import io
3
- import os
4
- import datetime
5
  import gradio as gr
6
  from markitdown import MarkItDown
 
 
 
7
 
8
- md = MarkItDown()
 
9
 
10
- def convert_file(file, output_format):
11
- if file is None:
12
- return gr.update(value="Please upload a file."), None
13
 
14
  try:
15
- result = md.convert(file.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- text = getattr(result, "text_content", None)
18
- if not text:
19
- text = getattr(result, "markdown_content", "")
20
- if not text:
21
- text = "No textual content extracted."
22
 
23
- base = os.path.splitext(os.path.basename(file.name))[0]
24
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
25
 
26
- if output_format == "markdown":
27
- out_name = f"{base}_extracted_{timestamp}.md"
28
- else:
29
- out_name = f"{base}_extracted_{timestamp}.txt"
30
 
31
- bytes_io = io.BytesIO(text.encode("utf-8"))
32
- bytes_io.seek(0)
33
- return text, (out_name, bytes_io)
 
 
34
 
35
- except Exception as e:
36
- return f"❌ Conversion failed: {e}", None
37
 
 
 
38
 
39
- copy_js = """
40
- () => {
41
- const tb = document.querySelector('textarea');
42
- if (!tb) { alert('Nothing to copy'); return; }
43
- tb.select();
44
- document.execCommand('copy');
45
- alert('Copied to clipboard');
46
- }
47
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- with gr.Blocks(title="MarkItDown - Document Extractor") as demo:
50
- gr.Markdown(
51
- """
52
- # πŸ“ MarkItDown – Document Text Extractor
53
- Upload a **PDF, DOCX, PPTX, EML, HTML**, or similar file and extract clean text using https://github.com/microsoft/markitdown.
54
- """
55
  )
56
 
57
  with gr.Row():
58
- file_input = gr.File(
59
- label="Upload a document",
60
- file_count="single",
61
- type="filepath",
62
- file_types=[".pdf", ".docx", ".pptx", ".html", ".htm", ".eml", ".txt", ".md", ".rtf"],
63
  )
64
- output_format = gr.Radio(
65
- choices=["markdown", "text"],
66
- value="markdown",
67
- label="Download format",
 
68
  )
69
 
70
- with gr.Row():
71
- convert_btn = gr.Button("Convert", variant="primary")
72
- clear_btn = gr.Button("Clear")
73
- copy_btn = gr.Button("Copy Text")
74
-
75
- text_output = gr.Textbox(
76
- label="Extracted Text",
77
- lines=20
78
- )
79
- download_file = gr.File(
80
- label="Download Extracted File",
81
- interactive=False
82
- )
83
-
84
  convert_btn.click(
85
- fn=convert_file,
86
- inputs=[file_input, output_format],
87
- outputs=[text_output, download_file],
88
- api_name="convert"
89
- )
90
-
91
- clear_btn.click(
92
- fn=lambda: (None, "", None),
93
- inputs=[],
94
- outputs=[file_input, text_output, download_file]
95
  )
96
 
97
- # Client-side copy to clipboard
98
- copy_btn.click(None, [], [], js=copy_js)
 
 
 
 
 
99
 
100
  if __name__ == "__main__":
101
- demo.launch()
102
-
 
1
+ # app.py
2
 
 
 
 
3
  import gradio as gr
4
  from markitdown import MarkItDown
5
+ from pathlib import Path
6
+ import tempfile
7
+ import os
8
 
9
+ # Increase max file size if you expect very large documents
10
+ MAX_FILE_SIZE_MB = 50
11
 
12
def convert_to_markdown(file_obj):
    """Convert an uploaded document to Markdown using MarkItDown.

    Args:
        file_obj: The upload value handed over by the ``gr.File`` input.
            ``None`` means nothing was uploaded. Otherwise either a
            filesystem path (``str`` / ``os.PathLike``) or an object that
            exposes the path as ``.name`` is accepted, because which of the
            two arrives depends on the Gradio version and the component's
            ``type`` setting (confirm against the installed Gradio).

    Returns:
        A ``(status, markdown_text, suggested_filename)`` tuple of strings.
        Every code path returns exactly three values, because the click
        handler is wired to three output components.
    """
    if file_obj is None:
        return "Please upload a document.", "", ""

    try:
        # Accept both a plain path and a file-like wrapper with `.name`.
        if isinstance(file_obj, (str, os.PathLike)):
            source_path = Path(file_obj)
        else:
            source_path = Path(file_obj.name)

        original_name = source_path.stem
        ext = source_path.suffix.lower()

        # A tuple keeps the order stable for the user-facing message below.
        allowed_extensions = (
            '.pdf', '.doc', '.docx', '.ppt', '.pptx',
            '.xls', '.xlsx', '.odt', '.rtf', '.txt', '.md',
        )

        if ext not in allowed_extensions:
            return (
                f"Unsupported file format: {ext}\n\n"
                f"Supported formats: {', '.join(allowed_extensions)}",
                "",
                "",
            )

        # The upload is converted straight from its path, so no second
        # temporary copy (and no cleanup of it) is needed.
        result = MarkItDown().convert(str(source_path))

        # Guard against a missing/empty text payload so the outputs stay strings.
        markdown_text = result.text_content or ""

        return (
            f"Conversion successful! ✓\nFile: {original_name}{ext}",
            markdown_text,
            f"{original_name}.md",
        )

    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # letting the handler raise.
        import traceback

        trace = traceback.format_exc()
        if len(trace) > 800:
            # Only mark the trace as shortened when it really was.
            trace = trace[:800] + "…"
        return f"Error during conversion:\n{e}\n\n{trace}", "", ""
 
64
 
 
 
65
 
66
+ # ────────────────────────────────────────────────
67
+ # Gradio Interface
68
+ # ────────────────────────────────────────────────
 
69
 
70
# Custom styles for the upload area and status colours.
css = """
.upload-box {border: 1px solid #ccc; border-radius: 8px; padding: 16px; background: #fafafa;}
.success {color: #2e7d32;}
.error {color: #c62828;}
"""


def _write_markdown_download(md_text, fname):
    """Write converted Markdown to a temp file and return its path.

    The download component needs a path to a real file, not the Markdown
    text itself. Returns ``None`` when there is nothing to download.
    """
    if not md_text:
        return None

    # The filename box is user-editable: keep only the final path component
    # so the file cannot be written outside the fresh temp directory.
    safe_name = Path(fname or "document.md").name or "document.md"
    if not safe_name.lower().endswith(".md"):
        safe_name += ".md"

    # One directory per conversion lets the file keep the requested name.
    # The directory is not removed here; the file must outlive this call.
    out_dir = tempfile.mkdtemp(prefix="markitdown_")
    out_path = os.path.join(out_dir, safe_name)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(md_text)
    return out_path


with gr.Blocks(title="Document → Markdown Converter", css=css) as demo:

    gr.Markdown("""
    # Document → Markdown Converter

    Upload PDF, Word, PowerPoint, Excel, ... → get clean Markdown

    Powered by **markitdown** • Works best with text-heavy documents
    """)

    with gr.Row():
        with gr.Column(scale=4):
            # NOTE(review): no size argument is passed here. The upload cap
            # is applied through launch(max_file_size=...) at the bottom of
            # the file; confirm against the installed Gradio version.
            file_input = gr.File(
                label="Upload document",
                file_count="single",
                file_types=[
                    ".pdf", ".doc", ".docx", ".ppt", ".pptx",
                    ".xls", ".xlsx", ".odt", ".rtf", ".txt", ".md"
                ],
                elem_classes="upload-box",
            )

        with gr.Column(scale=1, min_width=180):
            convert_btn = gr.Button("Convert to Markdown", variant="primary", scale=1)

    status_output = gr.Textbox(
        label="Status",
        lines=3,
        interactive=False,
        show_copy_button=False
    )

    markdown_output = gr.Markdown(
        label="Converted Markdown",
        height=500
    )

    with gr.Row():
        download_file = gr.File(
            label="Download markdown file",
            file_types=[".md"],
            interactive=False
        )
        download_name = gr.Textbox(
            label="Suggested filename",
            value="document.md",
            interactive=True,
            max_lines=1
        )

    # ─── Main action ───────────────────────────────────────
    # Step 1 converts the upload; step 2 writes the result to a file so the
    # download component receives a path.
    convert_btn.click(
        fn=convert_to_markdown,
        inputs=file_input,
        outputs=[status_output, markdown_output, download_name]
    ).then(
        fn=_write_markdown_download,
        inputs=[markdown_output, download_name],
        outputs=download_file
    )

    # f-string so the configured limit is shown instead of a literal placeholder.
    gr.Markdown(f"""
    ### Notes
    - Maximum file size: ~{MAX_FILE_SIZE_MB} MB (Hugging Face free tier limit is usually higher)
    - Best results with text-oriented documents
    - Tables, images, complex layouts may be simplified
    - Very large documents may take longer or timeout
    """)

if __name__ == "__main__":
    # Apply the upload size cap at launch time.
    demo.launch(max_file_size=f"{MAX_FILE_SIZE_MB}mb")