qtpi committed on
Commit
1fe9f7c
·
verified ·
1 Parent(s): 0068cc6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -71
app.py CHANGED
@@ -3,49 +3,55 @@ import json
3
  import os
4
  import gradio as gr
5
  import tempfile
 
 
6
 
7
- # Set JAVA_HOME environment variable
8
- # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
9
 
10
- def grade_pdf(input_pdf, pdffigures2_jar):
11
  """
12
  Process a PDF file using pdffigures2 and return the extracted JSON output.
13
-
14
  Args:
15
- input_pdf (str): Path to the input PDF file.
16
- output_dir (str): Path to the directory where JSON output will be saved.
17
  pdffigures2_jar (str): Path to the pdffigures2 JAR file.
18
-
19
  Returns:
20
- dict: Parsed JSON data extracted by pdffigures2.
21
-
22
  Raises:
23
  FileNotFoundError: If the input PDF or JAR file does not exist.
24
  Exception: If the command fails or the JSON output is not generated.
25
  """
26
  # Check if input file and JAR file exist
27
- if not os.path.exists(input_pdf):
28
- raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
29
  if not os.path.exists(pdffigures2_jar):
30
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
31
 
 
 
 
 
 
 
 
 
32
 
33
- # Command to execute pdffigures2
34
- command = [
35
- "java", "-jar", pdffigures2_jar,
36
- input_pdf,
37
- "-d", '_'
38
- ]
39
- print(input_pdf)
 
40
 
41
- try:
42
  # Run the command
43
- subprocess.run(command, capture_output=True, text=True, check=True)
 
 
 
 
44
 
45
- # Construct the output JSON file path
46
- # print()
47
- output_json_file = "_"+os.path.splitext(os.path.basename(input_pdf))[0] +".json"
48
- print("output_json_file: ", output_json_file)
49
 
50
  # Read and return the JSON data
51
  if os.path.exists(output_json_file):
@@ -55,81 +61,207 @@ def grade_pdf(input_pdf, pdffigures2_jar):
55
  issueList = list()
56
 
57
  for region in regions:
58
- issues = dict()
59
- if(region["captionBoundary"]["y1"]>region["regionBoundary"]["y1"]):
60
- region["captionLocation"]="Below"
 
 
 
 
 
 
61
  else:
62
- region["captionLocation"]="Above"
63
-
64
- if(region["captionLocation"]=="Below" and region["figType"] == "Table"):
65
- issues["page_number"]=region["page"]+1
66
- issues["caption_coordinate"]=region["captionBoundary"]
67
- issues["fig_type"] = region["figType"]
68
- issues["caption_location"] = region["captionLocation"]
69
- issues["description"]="Location of the caption for tables must be above the table."
70
- issueList.append(issues)
71
-
72
- if(region["captionLocation"]=="Above" and region["figType"] == "Figure"):
73
- issues["page_number"]=region["page"]+1
74
- issues["caption_coordinate"]=region["captionBoundary"]
75
- issues["fig_type"] = region["figType"]
76
- issues["description"]="Location of the caption for figures must be below the table."
 
 
 
 
77
  issueList.append(issues)
78
-
 
 
 
 
 
 
 
 
 
 
 
79
  return issueList
80
 
81
  else:
82
- raise Exception(f"JSON output file not found: {output_json_file}")
 
83
 
84
  except subprocess.CalledProcessError as e:
85
- raise Exception(f"Error while running pdffigures2: {e.stderr}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- def process_file(file):
88
  """
89
- Wrapper to process a file via grade_pdf and return JSON data.
 
90
 
91
  Args:
92
- file (str): Path to the uploaded PDF file.
93
 
94
  Returns:
95
- str: JSON string of extracted data.
96
  """
97
- # Define paths
98
- pdffigures2_jar_path = "pdffigures2.jar" # Path to pdffigures2 JAR file
99
-
100
- temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
101
- temp_input.write(file)
102
- temp_input_path = temp_input.name
103
- print("Path of the input file: ",temp_input_path)
 
 
 
 
 
 
 
104
 
105
  try:
106
- # Process the PDF and get JSON data
107
- # print(file.name)
 
 
 
 
 
 
 
 
 
108
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
109
- # Clean up the temporary input file
110
- os.unlink(temp_input_path)
111
-
112
 
113
  # Return JSON as a formatted string
114
  return json.dumps(extracted_data, indent=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  except Exception as e:
116
- return f"An error occurred: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  # Gradio Interface
119
  def gradio_interface():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  interface = gr.Interface(
121
  fn=process_file,
122
-
123
- inputs=gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary"),
124
-
125
- outputs=gr.Textbox(label="Extracted JSON Data", lines=20),
126
-
127
- title="PDF Grading Interface",
128
-
129
- description="Upload a PDF file, and this tool will extract figures and captions as JSON."
 
 
 
 
 
 
130
  )
131
  return interface
132
 
133
  if __name__ == "__main__":
134
  # Launch the Gradio app
135
- gradio_interface().launch()
 
 
 
3
  import os
4
  import gradio as gr
5
  import tempfile
6
+ import shutil # Needed for rmtree
7
+ import traceback # For detailed error logging
8
 
9
+ # Set JAVA_HOME environment variable (Uncomment and set if needed)
10
+ # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64' # Example path
11
 
12
+ def grade_pdf(input_pdf_path, pdffigures2_jar):
13
  """
14
  Process a PDF file using pdffigures2 and return the extracted JSON output.
 
15
  Args:
16
+ input_pdf_path (str): Path to the input PDF file.
 
17
  pdffigures2_jar (str): Path to the pdffigures2 JAR file.
 
18
  Returns:
19
+ list: List of issues found based on figure/table caption rules.
 
20
  Raises:
21
  FileNotFoundError: If the input PDF or JAR file does not exist.
22
  Exception: If the command fails or the JSON output is not generated.
23
  """
24
  # Check if input file and JAR file exist
25
+ if not os.path.exists(input_pdf_path):
26
+ raise FileNotFoundError(f"Input PDF file not found: {input_pdf_path}")
27
  if not os.path.exists(pdffigures2_jar):
28
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
29
 
30
+ temp_output_dir = None # Initialize
31
+ try:
32
+ # Create a temporary directory for the output JSON
33
+ temp_output_dir = tempfile.mkdtemp()
34
+ # Construct the expected output JSON file path INSIDE the temp directory
35
+ # pdffigures2 creates a JSON named after the input file inside the dir specified by -d
36
+ output_json_file = os.path.join(temp_output_dir, os.path.splitext(os.path.basename(input_pdf_path))[0] + ".json")
37
+ print(f"Expecting output JSON at: {output_json_file}")
38
 
39
+ # Command to execute pdffigures2
40
+ # Use -d to specify the *directory* for the JSON output
41
+ command = [
42
+ "java", "-jar", pdffigures2_jar,
43
+ input_pdf_path,
44
+ "-d", temp_output_dir # Output JSON to the temp directory
45
+ ]
46
+ print(f"Running command: {' '.join(command)}")
47
 
 
48
  # Run the command
49
+ result = subprocess.run(command, capture_output=True, text=True, check=True, timeout=60) # Added timeout
50
+ print("pdffigures2 stdout:", result.stdout)
51
+ # Stderr might contain useful info even on success for pdffigures2
52
+ if result.stderr:
53
+ print("pdffigures2 stderr:", result.stderr)
54
 
 
 
 
 
55
 
56
  # Read and return the JSON data
57
  if os.path.exists(output_json_file):
 
61
  issueList = list()
62
 
63
  for region in regions:
64
+ # Determine caption location
65
+ caption_loc = "Above" # Default
66
+ # Ensure keys exist before accessing
67
+ if "captionBoundary" in region and "regionBoundary" in region and \
68
+ isinstance(region["captionBoundary"], dict) and isinstance(region["regionBoundary"], dict) and \
69
+ "y1" in region["captionBoundary"] and "y1" in region["regionBoundary"]:
70
+
71
+ if(region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]):
72
+ caption_loc = "Below"
73
  else:
74
+ print(f"Warning: Missing boundary information for a region on page {region.get('page', 'N/A')}. Skipping location check.")
75
+ continue # Skip this region if boundaries are missing
76
+
77
+ region["captionLocation"] = caption_loc # Store for potential use/debugging
78
+
79
+ fig_type = region.get("figType") # Use .get for safety
80
+ page_num = region.get("page", -1) + 1 # Use .get and handle potential missing key
81
+
82
+ issues = {} # Initialize for this specific issue potential
83
+
84
+ # Check for Table caption below
85
+ if caption_loc == "Below" and fig_type == "Table":
86
+ issues = {
87
+ "page_number": page_num,
88
+ "caption_coordinate": region.get("captionBoundary"),
89
+ "fig_type": fig_type,
90
+ "caption_location": caption_loc,
91
+ "description": "Location of the caption for tables must be above the table."
92
+ }
93
  issueList.append(issues)
94
+
95
+ # Check for Figure caption above
96
+ elif caption_loc == "Above" and fig_type == "Figure":
97
+ issues = {
98
+ "page_number": page_num,
99
+ "caption_coordinate": region.get("captionBoundary"),
100
+ "fig_type": fig_type,
101
+ "caption_location": caption_loc,
102
+ "description": "Location of the caption for figures must be below the figure."
103
+ }
104
+ issueList.append(issues)
105
+
106
  return issueList
107
 
108
  else:
109
+ # If JSON not found, include stderr from the process
110
+ raise Exception(f"JSON output file not found: {output_json_file}. pdffigures2 stderr: {result.stderr if result else 'No result object'}")
111
 
112
  except subprocess.CalledProcessError as e:
113
+ # Include command output/error in the exception
114
+ raise Exception(f"Error while running pdffigures2 (return code {e.returncode}):\nSTDOUT:\n{e.stdout}\nSTDERR:\n{e.stderr}")
115
+ except subprocess.TimeoutExpired as e:
116
+ raise Exception(f"pdffigures2 command timed out after {e.timeout} seconds.\nSTDOUT:\n{e.stdout}\nSTDERR:\n{e.stderr}")
117
+ except Exception as e:
118
+ # Re-raise other exceptions for clarity
119
+ print(f"An unexpected error occurred in grade_pdf:")
120
+ traceback.print_exc() # Print detailed traceback for debugging
121
+ raise e # Re-raise the original exception
122
+ finally:
123
+ # Clean up the temporary output directory and its contents
124
+ if temp_output_dir and os.path.exists(temp_output_dir):
125
+ try:
126
+ shutil.rmtree(temp_output_dir)
127
+ print(f"Cleaned up temporary output directory: {temp_output_dir}")
128
+ except Exception as cleanup_error:
129
+ print(f"Warning: Failed to cleanup temporary output directory {temp_output_dir}: {cleanup_error}")
130
+
131
 
132
def process_file(file_bytes: bytes):  # Expect raw bytes
    """
    Grade an uploaded PDF, given as raw bytes, and return the result as JSON.

    The bytes are written to a temporary ``.pdf`` file, that file's path is
    passed to ``grade_pdf`` together with the pdffigures2 JAR path, and the
    temporary file is removed again before this function returns.  Failures
    are never raised to the caller; they are reported as a JSON object with a
    single ``"error"`` key.

    Args:
        file_bytes (bytes): The raw byte content of the uploaded PDF file.

    Returns:
        str: JSON string (indent=4) holding either the value returned by
        ``grade_pdf`` or an ``{"error": ...}`` object.
    """
    def _error_json(message):
        # Single place that defines the shape of an error response.
        return json.dumps({"error": message}, indent=4)

    # Path is relative to the current working directory.
    pdffigures2_jar_path = "pdffigures2.jar"
    temp_input_path = None  # Set as soon as the temporary file exists

    # --- Input sanity check ---
    if not file_bytes or not isinstance(file_bytes, bytes):
        return _error_json("Invalid input: No file data received or data is not in bytes format.")
    if not file_bytes.startswith(b'%PDF'):
        # Deliberately non-fatal: the data is still handed to grade_pdf,
        # which gets the final say on whether it can be processed.
        print("Warning: Input data does not start with PDF magic number (%PDF).")

    print(f"Received {len(file_bytes)} bytes of file data.")

    try:
        # delete=False keeps the file on disk after the handle is closed so
        # that grade_pdf can open it by path; removal happens in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf', mode='wb') as temp_input:
            # Record the path BEFORE writing: if write() fails, the finally
            # block must still know which file to remove.
            temp_input_path = temp_input.name
            temp_input.write(file_bytes)

        print(f"PDF bytes written to temporary file: {temp_input_path}")

        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)

        # Return JSON as a formatted string
        return json.dumps(extracted_data, indent=4)

    except FileNotFoundError as e:
        # Classify by the path named in the exception message so the client
        # gets specific feedback.
        details = str(e)
        if pdffigures2_jar_path in details:
            error_msg = f"Configuration error: pdffigures2 JAR file not found at '{pdffigures2_jar_path}'. Ensure it's present and accessible."
        elif temp_input_path and str(temp_input_path) in details:
            error_msg = f"Internal error: Could not find the temporary PDF file '{temp_input_path}' after creating it. {e}"
        else:
            error_msg = f"File system error: {e}"
        print(error_msg)
        return _error_json(error_msg)
    except Exception as e:
        # Full traceback goes to the server log; the client only learns the
        # exception type.
        print("------ ERROR in process_file ------")
        traceback.print_exc()
        print("-----------------------------------")
        return _error_json(f"An unexpected error occurred during processing: {type(e).__name__}")
    finally:
        # Best-effort removal of the temporary *input* file; a cleanup
        # failure is logged and must not replace the response being returned.
        if temp_input_path and os.path.exists(temp_input_path):
            try:
                os.unlink(temp_input_path)
                print(f"Cleaned up temporary input file: {temp_input_path}")
            except Exception as cleanup_error:
                print(f"Warning: Failed to cleanup temporary input file {temp_input_path}: {cleanup_error}")
204
 
205
  # Gradio Interface
206
def gradio_interface():
    """
    Build the Gradio interface for the caption placement checker.

    Two diagnostic checks run before the UI is constructed: whether
    ``pdffigures2.jar`` exists relative to the working directory, and whether
    ``java -version`` can be executed.  Their outcome is only printed; neither
    check prevents the interface from being built.

    Returns:
        gr.Interface: Interface wired to ``process_file``, taking a PDF upload
        as binary data and showing the result in a textbox.
    """
    # --- Pre-launch checks (diagnostic output only) ---
    jar_name = "pdffigures2.jar"
    jar_present = os.path.exists(jar_name)
    if not jar_present:
        print(f"ERROR: {jar_name} not found in the current directory.")
        print("Please download it and place it alongside this script.")

    try:
        print("Checking for Java installation...")
        probe = subprocess.run(
            ["java", "-version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        # `java -version` often prints to stderr, so inspect both streams.
        in_stdout = "version" in probe.stdout
        in_stderr = "version" in probe.stderr
        if probe.returncode != 0 or not (in_stdout or in_stderr):
            print(f"Warning: 'java -version' returned code {probe.returncode} or did not contain 'version'. Ensure Java is correctly installed and in PATH.")
            print("STDOUT:", probe.stdout)
            print("STDERR:", probe.stderr)
        else:
            print("Java installation found:")
            if in_stderr:
                print(probe.stderr)
            else:
                print(probe.stdout)
    except FileNotFoundError:
        # Raised when the `java` executable itself cannot be located.
        print("ERROR: 'java' command not found. Please install Java (JRE or JDK) and ensure it's in your system's PATH.")
    except subprocess.TimeoutExpired:
        print("ERROR: 'java -version' command timed out. Java installation might be corrupted.")
    except Exception as e:
        print(f"ERROR: An unexpected error occurred while checking for Java: {e}")

    # --- Define interface ---
    # type="binary" makes Gradio hand the upload to process_file as bytes.
    pdf_upload = gr.File(
        label="Upload PDF File",
        file_types=[".pdf"],
        type="binary",
    )
    results_box = gr.Textbox(
        label="Analysis Results (JSON)",
        lines=20,
    )
    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=results_box,
        title="PDF Figure/Table Caption Placement Checker",
        description="Upload a PDF file (via API or UI). This tool uses pdffigures2 to identify figures/tables and checks if captions are placed correctly (Tables: Above, Figures: Below). Returns results as JSON.",
    )
262
 
263
if __name__ == "__main__":
    # Build the interface first, then start the Gradio server.
    # server_name="0.0.0.0" makes it accessible on the network;
    # share=True (not used here) would create a public link (use with caution).
    app = gradio_interface()
    app.launch(server_name="0.0.0.0")