cryogenic22 committed on
Commit
88954d2
·
verified ·
1 Parent(s): 6c6df07

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -27
app.py CHANGED
@@ -1,36 +1,58 @@
1
  import streamlit as st
2
  from docling.parsers import DOCXParser, PDFParser, PPTXParser
3
  from docling.utils import to_json, to_markdown
 
 
4
 
5
# Streamlit page script: upload a document, parse it with the parser that
# matches its extension, and render the result as Markdown or JSON.
st.title("Docling Document Processor")

# File uploader
uploaded_file = st.file_uploader(
    "Choose a document (PDF, DOCX, PPTX)", type=["pdf", "docx", "pptx"]
)

if uploaded_file is not None:
    # Determine file type and pick the matching parser.
    file_extension = uploaded_file.name.split(".")[-1].lower()
    if file_extension == "pdf":
        parser = PDFParser()
    elif file_extension == "docx":
        parser = DOCXParser()
    elif file_extension == "pptx":
        parser = PPTXParser()
    else:
        st.error("Unsupported file type.")
        # Halt this run: `parser` is unbound here, so continuing would only
        # surface a confusing NameError further down.
        st.stop()

    # Parse the document
    try:
        document = parser.parse(uploaded_file)
    except Exception as e:
        st.error(f"Error parsing document: {e}")
        # Halt this run: `document` is unbound after a failed parse, so the
        # rendering code below must not execute.
        st.stop()

    # Display output options
    output_format = st.radio("Select output format:", ("Markdown", "JSON"))

    if output_format == "Markdown":
        st.subheader("Markdown Output:")
        st.write(to_markdown(document))
    else:
        st.subheader("JSON Output:")
        st.json(to_json(document))
 
1
  import streamlit as st
2
  from docling.parsers import DOCXParser, PDFParser, PPTXParser
3
  from docling.utils import to_json, to_markdown
4
+ import sys
5
+ import traceback
6
 
7
def init_parsers():
    """Create one parser instance per supported file extension.

    Returns:
        dict: Maps the lowercase extensions "pdf", "docx" and "pptx" to a
        newly constructed PDFParser, DOCXParser and PPTXParser respectively.
    """
    return {
        "pdf": PDFParser(),
        "docx": DOCXParser(),
        "pptx": PPTXParser(),
    }
14
 
15
def process_document(file, parser):
    """Parse ``file`` with ``parser``, reporting failures as text instead of raising.

    Args:
        file: The object passed unchanged to ``parser.parse``.
        parser: An object exposing ``parse(file)``, or ``None`` when no parser
            is available for the file's type.

    Returns:
        tuple: ``(document, None)`` on success, where ``document`` is whatever
        ``parser.parse`` returned; ``(None, error_msg)`` on failure, where
        ``error_msg`` starts with ``"Error: "``.
    """
    # An explicit guard gives a readable message instead of an AttributeError
    # traceback about NoneType.
    if parser is None:
        return None, "Error: No parser available for this file type."

    try:
        document = parser.parse(file)
    except Exception as e:
        # Deliberately broad: this is the boundary where any parser failure is
        # converted into a message (with traceback) for the caller to display.
        return None, f"Error: {str(e)}\n{traceback.format_exc()}"
    return document, None
22
+
23
def main():
    """Run the page: upload a document, parse it, and render the result.

    Shows an error and returns early when the parsers cannot be created, when
    no parser is registered for the uploaded file's extension, or when parsing
    fails. Otherwise the parsed document is rendered as Markdown or JSON
    according to the radio selection.
    """
    # Kept as the first Streamlit call; see the st.set_page_config docs for
    # its ordering requirement.
    st.set_page_config(page_title="Document Processor")
    st.title("Docling Document Processor")

    try:
        parsers = init_parsers()
    except ImportError:
        st.error("Error: Required packages not installed. Please run: pip install docling python-docx pdfminer.six python-pptx")
        return

    uploaded_file = st.file_uploader("Choose a document", type=list(parsers.keys()))
    if not uploaded_file:
        return

    file_extension = uploaded_file.name.split(".")[-1].lower()
    parser = parsers.get(file_extension)
    # Guard against an extension with no registered parser rather than
    # passing None on to the parsing step.
    if parser is None:
        st.error("Unsupported file type.")
        return

    document, error = process_document(uploaded_file, parser)
    if error:
        st.error(f"Failed to parse document:\n{error}")
        return

    output_format = st.radio("Select output format:", ("Markdown", "JSON"))

    try:
        if output_format == "Markdown":
            st.subheader("Markdown Output:")
            st.markdown(to_markdown(document))
        else:
            st.subheader("JSON Output:")
            st.json(to_json(document))
    except Exception as e:
        # Conversion failures are shown on the page instead of crashing the run.
        st.error(f"Error converting document: {str(e)}")


if __name__ == "__main__":
    main()