qtpi committed on
Commit
a033fd2
·
verified ·
1 Parent(s): ad7b5c2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import json
3
+ import os
4
+ import gradio as gr
5
+
6
# Export JAVA_HOME for this process and any child processes it spawns.
# The assignment is unconditional: a value already present in the
# environment is overwritten.
os.environ.update({'JAVA_HOME': '/usr/lib/jvm/java-11-openjdk-amd64'})
8
+
9
def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Process a PDF file using pdffigures2 and return the extracted JSON output.

    pdffigures2 is asked to write its figure data into a private temporary
    directory, which is deleted before this function returns. This keeps the
    working directory clean, prevents two uploads with the same file name
    from overwriting each other's output, and guarantees that a leftover file
    from an earlier run can never be mistaken for fresh output.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.

    Returns:
        The parsed JSON data produced by pdffigures2 (whatever ``json.load``
        yields for the generated file).

    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        Exception: If the command fails or the JSON output is not generated.
    """
    # Imported locally so the block is self-contained; the module's import
    # list does not include tempfile.
    import tempfile

    # Check if input file and JAR file exist
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    print(input_pdf)

    with tempfile.TemporaryDirectory() as work_dir:
        # "-d" takes a *prefix*: the tool writes "<prefix><pdf stem>.json".
        # The original "_" prefix is kept, but rooted in the temp directory.
        data_prefix = os.path.join(work_dir, "_")
        command = [
            "java", "-jar", pdffigures2_jar,
            input_pdf,
            "-d", data_prefix,
        ]

        try:
            subprocess.run(command, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            # Surface the tool's stderr and keep the original error chained.
            raise Exception(f"Error while running pdffigures2: {e.stderr}") from e

        pdf_stem = os.path.splitext(os.path.basename(input_pdf))[0]
        output_json_file = data_prefix + pdf_stem + ".json"

        if not os.path.exists(output_json_file):
            raise Exception(f"JSON output file not found: {output_json_file}")

        # Parse while the temporary directory still exists.
        with open(output_json_file, "r", encoding="utf-8") as f:
            return json.load(f)
57
+
58
def process_file(file):
    """
    Wrapper to process a file via grade_pdf and return JSON data.

    Args:
        file: The uploaded PDF as handed over by the Gradio File component.
            Accepted forms are a filesystem path (``str`` / ``os.PathLike``)
            or an object exposing the path through a ``name`` attribute.
            ``None`` means nothing was uploaded.

    Returns:
        str: JSON string of extracted data (indented by 4 spaces), or a
        message starting with "An error occurred: " when processing fails.
        Errors are returned as text rather than raised so they show up in
        the output textbox.
    """
    # Define paths
    pdffigures2_jar_path = "pdffigures2.jar"  # Path to pdffigures2 JAR file

    # Submitting without an upload would otherwise yield a cryptic
    # "'NoneType' object has no attribute 'name'" message.
    if file is None:
        return "An error occurred: no file was uploaded"

    try:
        # Accept both a plain path and a file-like wrapper with `.name`.
        if isinstance(file, (str, os.PathLike)):
            pdf_path = file
        else:
            pdf_path = file.name

        # Process the PDF and get JSON data
        extracted_data = grade_pdf(pdf_path, pdffigures2_jar_path)

        # Return JSON as a formatted string
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        return f"An error occurred: {e}"
81
+
82
+ # Gradio Interface
83
def gradio_interface():
    """
    Build the Gradio interface for the PDF tool.

    Wires ``process_file`` to a file-upload input and a 20-line textbox
    output. The interface is only constructed here, not launched.

    Returns:
        The configured ``gr.Interface`` object.
    """
    pdf_upload = gr.File(label="Upload PDF File")
    json_view = gr.Textbox(label="Extracted JSON Data", lines=20)

    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=json_view,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
92
+
93
if __name__ == "__main__":
    # Running as a script: build the interface, then start serving it.
    app = gradio_interface()
    app.launch()