rbughao committed on
Commit
d009d38
·
verified ·
1 Parent(s): 4946c48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -29
app.py CHANGED
@@ -1,56 +1,136 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
- import docx
4
- from markdownify import markdownify as md
5
- import os
6
  import tempfile
 
 
7
 
8
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string made of each page's ``get_text()`` output, with nothing
        inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even when extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
def extract_text_from_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The ``text`` of every paragraph in the document, joined by newlines.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
 
 
 
18
 
19
def convert_to_markdown(uploaded_file):
    """Convert an uploaded PDF or DOCX file into a Markdown file on disk.

    Args:
        uploaded_file: Upload object exposing the stored file's path as
            ``.name``, or ``None`` when nothing was uploaded.

    Returns:
        Path of a newly created ``.md`` file, or ``None`` when there is no
        upload or the extension is neither ``.pdf`` nor ``.docx``.
    """
    if uploaded_file is None:
        return None

    file_path = uploaded_file.name
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        text = extract_text_from_pdf(file_path)
    elif ext == ".docx":
        text = extract_text_from_docx(file_path)
    else:
        return None

    markdown_text = md(text)

    # Write through the handle NamedTemporaryFile returns instead of opening
    # the path a second time: the handle is closed on exit (no descriptor
    # leak) and the file is never opened twice, which fails on Windows.
    # delete=False keeps the file on disk so it can be served for download.
    with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", suffix=".md", delete=False
    ) as temp_md:
        temp_md.write(markdown_text)

    return temp_md.name
 
40
 
 
41
# Gradio UI: an upload widget, a convert button and a download slot.
with gr.Blocks() as demo:
    gr.Markdown("# 📄➡️📝 Document to Markdown Converter")
    gr.Markdown("Upload a **PDF or Word (.docx)** file and get a **Markdown (.md)** file.")

    file_input = gr.File(file_types=[".pdf", ".docx"], label="Upload PDF or DOCX")
    output_file = gr.File(label="Download Markdown File")

    convert_btn = gr.Button("Convert to Markdown")

    # Clicking the button feeds the uploaded file to the converter and puts
    # the returned path into the download component.
    convert_btn.click(convert_to_markdown, inputs=file_input, outputs=output_file)

demo.launch()
 
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
+ from docx import Document
 
 
4
  import tempfile
5
+ import os
6
+
7
 
8
+ # ---------- PDF (best-effort) ----------
9
def pdf_to_markdown(file_path):
    """Extract the text of a PDF as Markdown paragraphs (best effort).

    Each non-empty line of extracted text is stripped and emitted as its own
    paragraph; no headings, lists or tables are reconstructed.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The stripped, non-empty text lines of all pages in page order,
        separated by blank lines.
    """
    lines = []

    # The context manager closes the document (and its file handle) even if
    # a page fails to extract.
    with fitz.open(file_path) as doc:
        for page in doc:
            stripped_lines = (raw.strip() for raw in page.get_text("text").split("\n"))
            lines.extend(line for line in stripped_lines if line)

    return "\n\n".join(lines)
21
+
22
+
23
+ # ---------- DOCX (structure-aware) ----------
24
def _heading_level(style_name):
    """Map a "Heading N" style name to a Markdown heading level in 1..6."""
    try:
        level = int(style_name.replace("Heading", ""))
    except ValueError:
        # Style names such as "Heading" or "Heading Title" carry no number.
        level = 1
    # Markdown defines only six heading levels.
    return min(max(level, 1), 6)


def _wrap_emphasis(text, marker):
    """Wrap *text* in *marker*, keeping surrounding whitespace outside it."""
    core = text.strip()
    if not core:
        # A whitespace-only run has nothing visible to emphasise.
        return text
    # Markers directly next to whitespace ("** bold **") are not rendered as
    # emphasis, so the padding is moved outside the markers.
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    return leading + marker + core + marker + trailing


def _run_to_markdown(run):
    """Render one run, applying bold/italic markers when the run sets them."""
    text = run.text
    if not text:
        return ""
    if run.bold and run.italic:
        return _wrap_emphasis(text, "***")
    if run.bold:
        return _wrap_emphasis(text, "**")
    if run.italic:
        return _wrap_emphasis(text, "*")
    return text


def _cell_to_markdown(cell):
    """Flatten a table cell to one line that is safe inside a pipe table."""
    # An unescaped "|" would start a new column and a raw newline would end
    # the table row.
    return cell.text.strip().replace("|", "\\|").replace("\n", "<br>")


def _table_to_markdown(table):
    """Render a table as pipe-table lines, using the first row as header."""
    rows = [[_cell_to_markdown(cell) for cell in row.cells] for row in table.rows]
    if not rows:
        # A table without rows has no header to emit.
        return []

    header = rows[0]
    lines = [
        "",
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    lines.extend("| " + " | ".join(cells) + " |" for cells in rows[1:])
    return lines


def docx_to_markdown(file_path):
    """Convert a .docx file to Markdown using its paragraph styles.

    Paragraphs are handled by style name: "Heading..." styles become ``#``
    headings, "List Bullet" styles become ``-`` items and "List Number"
    styles become ``1.`` items. Any other paragraph is rendered run by run
    with bold/italic markers. Empty paragraphs become empty lines. All
    tables are appended after the paragraphs, not at their position in the
    document.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The Markdown text, one output line per paragraph or table row.
    """
    doc = Document(file_path)
    md = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            md.append("")
            continue

        # NOTE(review): guards against a style without a name; confirm
        # whether python-docx can actually return one.
        style_name = getattr(para.style, "name", None) or ""

        if style_name.startswith("Heading"):
            md.append("#" * _heading_level(style_name) + " " + text)
        elif "List Bullet" in style_name:
            md.append(f"- {text}")
        elif "List Number" in style_name:
            md.append(f"1. {text}")
        else:
            formatted = "".join(_run_to_markdown(run) for run in para.runs)
            # Fall back to the plain paragraph text when the runs yield
            # nothing, so a non-empty paragraph is never emitted as empty.
            md.append(formatted or text)

    for table in doc.tables:
        md.extend(_table_to_markdown(table))

    return "\n".join(md)
84
+
85
+
86
+ # ---------- Main handler ----------
87
def convert_file(uploaded_file):
    """Convert an uploaded DOCX or PDF file to Markdown.

    Args:
        uploaded_file: The upload handed over by the file component: an
            object exposing the stored path as ``.name``, a plain path
            string, or ``None`` when nothing was uploaded.

    Returns:
        A ``(markdown_text, md_path)`` tuple. ``md_path`` is the path of a
        newly written ``.md`` file. With no upload the result is
        ``("", None)``; for any extension other than ``.docx``/``.pdf`` it
        is ``("Unsupported file type.", None)``.
    """
    if uploaded_file is None:
        return "", None

    # Accept both an object carrying the path in `.name` and a bare path
    # string, so the handler does not depend on how the upload is delivered.
    file_path = getattr(uploaded_file, "name", uploaded_file)
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        markdown_text = docx_to_markdown(file_path)
    elif ext == ".pdf":
        markdown_text = pdf_to_markdown(file_path)
    else:
        return "Unsupported file type.", None

    # Write through the handle NamedTemporaryFile returns instead of opening
    # the path a second time: the handle is closed on exit (no descriptor
    # leak) and the file is never opened twice, which fails on Windows.
    # delete=False keeps the file on disk so it can be offered for download.
    with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", suffix=".md", delete=False
    ) as tmp:
        tmp.write(markdown_text)

    return markdown_text, tmp.name
106
+
107
 
108
+ # ---------- UI ----------
109
# Gradio UI: upload a document, convert it, then preview and download the
# resulting Markdown.
with gr.Blocks() as demo:
    gr.Markdown("# 📄➡️📝 Document Markdown Converter")
    # Written as a bullet list: two lines separated by a single newline
    # would be merged into one paragraph by the Markdown renderer.
    gr.Markdown(
        "- **DOCX:** High-quality Markdown (headings, lists, bold, italics, tables)\n"
        "- **PDF:** Best-effort text conversion (PDFs don’t store structure)"
    )

    with gr.Row():
        file_input = gr.File(
            label="Upload PDF or DOCX",
            file_types=[".pdf", ".docx"],
        )

    convert_btn = gr.Button("Convert")

    with gr.Row():
        md_preview = gr.Markdown(label="Live Markdown Preview")
        md_download = gr.File(label="Download .md file")

    # convert_file returns (markdown_text, md_path): the text fills the
    # preview and the path fills the download component.
    convert_btn.click(
        fn=convert_file,
        inputs=file_input,
        outputs=[md_preview, md_download],
    )

demo.launch()