rbughao committed on
Commit
f13a7e1
·
verified ·
1 Parent(s): ff01211

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -127
app.py CHANGED
@@ -1,146 +1,56 @@
1
- # app.py
2
-
3
  import gradio as gr
4
- from markitdown import MarkItDown
5
- from pathlib import Path
6
- import tempfile
7
  import os
8
- import traceback
9
-
10
- # Suggested limit — adjust as needed (Hugging Face Spaces free tier often allows 500 MB+)
11
- MAX_FILE_SIZE_STR = "100MB" # or "500MB", "2GB", etc.
12
-
13
- def convert_to_markdown(file_obj):
14
- if file_obj is None:
15
- return "Please upload a document.", ""
16
-
17
- try:
18
- original_name = Path(file_obj.name).stem
19
- ext = Path(file_obj.name).suffix.lower()
20
-
21
- allowed_extensions = [
22
- '.pdf', '.doc', '.docx', '.ppt', '.pptx',
23
- '.xls', '.xlsx', '.odt', '.rtf', '.txt', '.md'
24
- ]
25
-
26
- if ext not in allowed_extensions:
27
- return (
28
- f"Unsupported file format: {ext}\n\n"
29
- f"Supported: {', '.join(allowed_extensions)}",
30
- ""
31
- )
32
-
33
- # Write uploaded content to temporary file
34
- with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
35
- # Important: file_obj is a NamedTemporaryFile already in recent Gradio → read() works
36
- tmp.write(file_obj.read())
37
- tmp_path = tmp.name
38
-
39
- try:
40
- md = MarkItDown()
41
- result = md.convert(tmp_path)
42
- markdown_text = result.text_content
43
-
44
- suggested_filename = f"{original_name}.md"
45
 
46
- return (
47
- f"**Conversion successful!** ✓ \nFile: {original_name}{ext}",
48
- markdown_text,
49
- suggested_filename
50
- )
 
51
 
52
- finally:
53
- try:
54
- os.unlink(tmp_path)
55
- except:
56
- pass
57
 
58
- except Exception as e:
59
- error_detail = traceback.format_exc()
60
- return f"**Error during conversion:**\n{str(e)}\n\n{error_detail[:1000]}…", "", ""
61
 
 
 
62
 
63
- css = """
64
- .upload-box {border: 1px solid #ccc; border-radius: 8px; padding: 16px; background: #fafafa;}
65
- .success {color: #2e7d32;}
66
- .error {color: #c62828;}
67
- """
 
68
 
69
- with gr.Blocks(title="Document → Markdown Converter") as demo: # ← removed css= here
70
 
71
- gr.Markdown("""
72
- # Document Markdown Converter
 
73
 
74
- Upload PDF, Word, PowerPoint, Excel, etc. → clean Markdown
75
 
76
- Powered by **markitdown** • Best for text-heavy documents
77
- """)
 
78
 
79
- with gr.Row():
80
- with gr.Column(scale=4):
81
- file_input = gr.File(
82
- label="Upload document",
83
- file_count="single",
84
- file_types=[
85
- ".pdf", ".doc", ".docx", ".ppt", ".pptx",
86
- ".xls", ".xlsx", ".odt", ".rtf", ".txt", ".md"
87
- ],
88
- elem_classes="upload-box",
89
- # NO max_size here — it was never valid
90
- )
91
 
92
- with gr.Column(scale=1, min_width=180):
93
- convert_btn = gr.Button("Convert to Markdown", variant="primary")
94
 
95
- status_output = gr.Textbox(
96
- label="Status",
97
- lines=3,
98
- interactive=False,
99
- show_copy_button=False
100
- )
101
-
102
- markdown_output = gr.Markdown(
103
- label="Converted Markdown",
104
- height=500
105
- )
106
-
107
- with gr.Row():
108
- download_file = gr.File(
109
- label="Download markdown file",
110
- file_types=[".md"],
111
- interactive=False
112
- )
113
- download_name = gr.Textbox(
114
- label="Suggested filename",
115
- value="document.md",
116
- interactive=True,
117
- max_lines=1
118
- )
119
-
120
- # Main action
121
  convert_btn.click(
122
  fn=convert_to_markdown,
123
  inputs=file_input,
124
- outputs=[status_output, markdown_output, download_name]
125
- ).then(
126
- fn=lambda md_text, fname: gr.File(value=md_text, filename=fname) if md_text else None,
127
- inputs=[markdown_output, download_name],
128
- outputs=download_file
129
  )
130
 
131
- gr.Markdown(f"""
132
- ### Notes
133
- - Maximum file size: **{MAX_FILE_SIZE_STR}** (set via launch parameter)
134
- - Best with text-oriented documents
135
- - Tables / images / layouts may be simplified
136
- - Very large files may timeout or fail
137
- """)
138
-
139
-
140
- if __name__ == "__main__":
141
- demo.launch(
142
- max_file_size=MAX_FILE_SIZE_STR, # ← This is where the limit goes!
143
- # Optional: in_spaces you often want these too
144
- # server_name="0.0.0.0",
145
- # server_port=7860,
146
- )
 
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import docx
4
+ from markdownify import markdownify as md
5
  import os
6
+ import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, in iteration
        order, joined with no separator.
    """
    # The context manager closes the document even when extraction of a
    # page raises; previously the handle was left open after the call.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
14
 
15
def extract_text_from_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Args:
        file_path: Path of the document, opened with ``docx.Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = docx.Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)
 
 
18
 
19
def convert_to_markdown(uploaded_file):
    """Convert an uploaded PDF or DOCX file into a Markdown file on disk.

    Args:
        uploaded_file: Either a filesystem path (``str`` / ``os.PathLike``)
            or an object whose ``name`` attribute holds the path. ``None``
            means nothing was uploaded.

    Returns:
        Path of a newly created temporary ``.md`` file, or ``None`` when no
        file was supplied or the extension is neither ``.pdf`` nor ``.docx``.
    """
    if uploaded_file is None:
        return None

    # Accept a plain path as well as a wrapper object exposing `.name`, so
    # the handler works whichever form the upload component delivers.
    if isinstance(uploaded_file, (str, os.PathLike)):
        file_path = os.fspath(uploaded_file)
    else:
        file_path = uploaded_file.name
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        text = extract_text_from_pdf(file_path)
    elif ext == ".docx":
        text = extract_text_from_docx(file_path)
    else:
        return None

    markdown_text = md(text)

    # Write through the temp file's own handle and close it on exit, rather
    # than leaking that handle and reopening the same path a second time.
    # delete=False keeps the file on disk after this function returns so the
    # returned path is still valid for the caller.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".md", delete=False
    ) as temp_md:
        temp_md.write(markdown_text)

    return temp_md.name
40
 
41
# Build the UI: one upload widget, one download widget, and a button that
# runs the conversion and routes its result to the download widget.
with gr.Blocks() as demo:
    gr.Markdown("# 📄➡️📝 Document to Markdown Converter")
    gr.Markdown("Upload a **PDF or Word (.docx)** file and get a **Markdown (.md)** file.")

    file_input = gr.File(label="Upload PDF or DOCX", file_types=[".pdf", ".docx"])
    output_file = gr.File(label="Download Markdown File")

    convert_btn = gr.Button("Convert to Markdown")

    # convert_to_markdown returns a file path (or None), which the output
    # File component receives as its new value.
    convert_btn.click(
        fn=convert_to_markdown,
        inputs=file_input,
        outputs=output_file,
    )

# Launch only when executed as a script so that importing this module
# (e.g. for tests or tooling) does not start a server.
if __name__ == "__main__":
    demo.launch()