dindizz committed on
Commit
392d77e
·
verified ·
1 Parent(s): 370754a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -14
app.py CHANGED
@@ -1,23 +1,42 @@
1
  import json
 
2
  from docling.document_converter import DocumentConverter
3
  import gradio as gr
 
4
 
5
  def pdf_to_json(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  converter = DocumentConverter()
7
- result = converter.convert(url)
8
-
9
  try:
10
- # Attempt direct JSON export
11
- json_output = result.document.export_to_json()
12
- except AttributeError:
13
- # Construct JSON if direct export is unavailable
14
- content = {
15
- "title": result.document.title if hasattr(result.document, 'title') else "Untitled",
16
- "sections": [section.text for section in result.document.sections]
17
- }
18
- json_output = json.dumps(content, indent=2)
19
-
20
- return json_output
 
 
 
 
 
 
21
 
22
  # Gradio interface
23
  iface = gr.Interface(
@@ -28,4 +47,4 @@ iface = gr.Interface(
28
  description="Convert a PDF from a URL to JSON format."
29
  )
30
 
31
- iface.launch()
 
1
  import json
2
+ import requests
3
  from docling.document_converter import DocumentConverter
4
  import gradio as gr
5
+ import tempfile
6
 
def pdf_to_json(url):
    """Download the PDF at ``url``, convert it with Docling, and return JSON text.

    Args:
        url: Address of the PDF to fetch with ``requests.get``.

    Returns:
        A string. On success it is the JSON produced by
        ``result.document.export_to_json()`` or, when that attribute is
        missing, a manually built ``{"title", "sections"}`` object dumped with
        ``json.dumps``. On failure it is a human-readable message beginning
        with ``"Error downloading PDF:"`` or ``"Error processing PDF:"``.
    """
    import os  # local import: only needed for temporary-file cleanup

    # Download the PDF. A timeout keeps a stalled server from hanging the
    # handler forever; requests.exceptions.Timeout is a RequestException, so
    # it is reported through the same error message as other download errors.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error downloading PDF: {e}"

    temp_pdf_path = None
    try:
        # delete=False so the file survives the `with` block and can be
        # reopened by path during conversion; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so a failed write is still cleaned up.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(response.content)

        # Initialize the DocumentConverter
        converter = DocumentConverter()

        try:
            # Convert the downloaded PDF file
            result = converter.convert(temp_pdf_path)

            # Try direct JSON export if supported
            # NOTE(review): confirm which export method the installed docling
            # version exposes; the fallback below assumes `.sections` exists.
            try:
                json_output = result.document.export_to_json()
            except AttributeError:
                # Construct JSON manually if export_to_json() is not available
                content = {
                    "title": result.document.title if hasattr(result.document, 'title') else "Untitled",
                    "sections": [section.text for section in result.document.sections]
                }
                json_output = json.dumps(content, indent=2)

            return json_output
        except Exception as e:
            # Top-level UI boundary: report the failure as text instead of raising.
            return f"Error processing PDF: {e}"
    finally:
        # Always remove the temporary PDF so repeated requests do not fill the disk.
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                # Best-effort cleanup: a missing or locked file must not mask the result.
                pass
40
 
41
  # Gradio interface
42
  iface = gr.Interface(
 
47
  description="Convert a PDF from a URL to JSON format."
48
  )
49
 
50
+ iface.launch(share=True)