rbughao committed on
Commit
fe22f31
·
verified ·
1 Parent(s): 7f7bfa6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -17
app.py CHANGED
@@ -8,47 +8,49 @@ from markitdown import MarkItDown
8
  md = MarkItDown()
9
 
10
  def convert_file(file, output_format):
11
- """
12
- Convert the uploaded file using MarkItDown and return the extracted text,
13
- plus a downloadable file in the chosen format.
14
- """
15
  if file is None:
16
  return gr.update(value="Please upload a file."), None
17
 
18
  try:
19
- # MarkItDown accepts a path-like string; gradio gives a temp file object
20
  result = md.convert(file.name)
21
 
22
- # Prefer text_content; fall back to markdown_content if available
23
  text = getattr(result, "text_content", None)
24
  if not text:
25
  text = getattr(result, "markdown_content", "")
26
  if not text:
27
  text = "No textual content extracted."
28
 
29
- # Build downloadable content
30
  base = os.path.splitext(os.path.basename(file.name))[0]
31
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
32
 
33
  if output_format == "markdown":
34
  out_name = f"{base}_extracted_{timestamp}.md"
35
- bytes_io = io.BytesIO(text.encode("utf-8"))
36
  else:
37
  out_name = f"{base}_extracted_{timestamp}.txt"
38
- bytes_io = io.BytesIO(text.encode("utf-8"))
39
 
 
40
  bytes_io.seek(0)
41
  return text, (out_name, bytes_io)
42
 
43
  except Exception as e:
44
- # Be user-friendly but retain the message for debugging in the UI
45
  return f"❌ Conversion failed: {e}", None
46
 
 
 
 
 
 
 
 
 
 
 
 
47
  with gr.Blocks(title="MarkItDown - Document Extractor") as demo:
48
  gr.Markdown(
49
  """
50
  # 📝 MarkItDown – Document Text Extractor
51
- Upload a **PDF, DOCX, PPTX, EML, HTML**, or similar file and extract clean text using [MarkItDown](https://github.com/microsoft/markitdown).
52
  """
53
  )
54
 
@@ -68,11 +70,11 @@ with gr.Blocks(title="MarkItDown - Document Extractor") as demo:
68
  with gr.Row():
69
  convert_btn = gr.Button("Convert", variant="primary")
70
  clear_btn = gr.Button("Clear")
 
71
 
72
  text_output = gr.Textbox(
73
  label="Extracted Text",
74
- lines=20,
75
- show_copy_button=True
76
  )
77
  download_file = gr.File(
78
  label="Download Extracted File",
@@ -86,15 +88,15 @@ with gr.Blocks(title="MarkItDown - Document Extractor") as demo:
86
  api_name="convert"
87
  )
88
 
89
- def clear():
90
- return None, "",""
91
-
92
  clear_btn.click(
93
  fn=lambda: (None, "", None),
94
  inputs=[],
95
  outputs=[file_input, text_output, download_file]
96
  )
97
 
 
 
 
98
  if __name__ == "__main__":
99
- # Spaces will call `demo.launch()` automatically, but this helps local runs.
100
  demo.launch()
 
 
8
  md = MarkItDown()
9
 
10
  def convert_file(file, output_format):
 
 
 
 
11
  if file is None:
12
  return gr.update(value="Please upload a file."), None
13
 
14
  try:
 
15
  result = md.convert(file.name)
16
 
 
17
  text = getattr(result, "text_content", None)
18
  if not text:
19
  text = getattr(result, "markdown_content", "")
20
  if not text:
21
  text = "No textual content extracted."
22
 
 
23
  base = os.path.splitext(os.path.basename(file.name))[0]
24
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
25
 
26
  if output_format == "markdown":
27
  out_name = f"{base}_extracted_{timestamp}.md"
 
28
  else:
29
  out_name = f"{base}_extracted_{timestamp}.txt"
 
30
 
31
+ bytes_io = io.BytesIO(text.encode("utf-8"))
32
  bytes_io.seek(0)
33
  return text, (out_name, bytes_io)
34
 
35
  except Exception as e:
 
36
  return f"❌ Conversion failed: {e}", None
37
 
38
+
39
# Browser-side handler for the "Copy Text" button. It copies the contents of
# the first <textarea> on the page. The async Clipboard API is used when the
# page runs in a secure context; otherwise (or if that call is rejected) it
# falls back to select() + execCommand('copy'). Success is only reported when
# a copy actually happened, and an empty textarea counts as nothing to copy.
copy_js = """
() => {
  const tb = document.querySelector('textarea');
  if (!tb || !tb.value) { alert('Nothing to copy'); return; }
  const legacyCopy = () => {
    tb.select();
    if (document.execCommand('copy')) {
      alert('Copied to clipboard');
    } else {
      alert('Copy failed');
    }
  };
  if (navigator.clipboard && window.isSecureContext) {
    navigator.clipboard.writeText(tb.value)
      .then(() => alert('Copied to clipboard'))
      .catch(legacyCopy);
  } else {
    legacyCopy();
  }
}
"""
48
+
49
  with gr.Blocks(title="MarkItDown - Document Extractor") as demo:
50
  gr.Markdown(
51
  """
52
  # 📝 MarkItDown – Document Text Extractor
53
+ Upload a **PDF, DOCX, PPTX, EML, HTML**, or similar file and extract clean text using https://github.com/microsoft/markitdown.
54
  """
55
  )
56
 
 
70
  with gr.Row():
71
  convert_btn = gr.Button("Convert", variant="primary")
72
  clear_btn = gr.Button("Clear")
73
+ copy_btn = gr.Button("Copy Text")
74
 
75
  text_output = gr.Textbox(
76
  label="Extracted Text",
77
+ lines=20
 
78
  )
79
  download_file = gr.File(
80
  label="Download Extracted File",
 
88
  api_name="convert"
89
  )
90
 
 
 
 
91
  clear_btn.click(
92
  fn=lambda: (None, "", None),
93
  inputs=[],
94
  outputs=[file_input, text_output, download_file]
95
  )
96
 
97
+ # Client-side copy to clipboard
98
+ copy_btn.click(None, [], [], _js=copy_js)
99
+
100
  if __name__ == "__main__":
 
101
  demo.launch()
102
+