qtpi committed on
Commit
e31f53e
·
verified ·
1 Parent(s): 1fe9f7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -203
app.py CHANGED
@@ -3,55 +3,49 @@ import json
3
  import os
4
  import gradio as gr
5
  import tempfile
6
- import shutil # Needed for rmtree
7
- import traceback # For detailed error logging
8
 
9
- # Set JAVA_HOME environment variable (Uncomment and set if needed)
10
- # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64' # Example path
11
 
12
- def grade_pdf(input_pdf_path, pdffigures2_jar):
13
  """
14
  Process a PDF file using pdffigures2 and return the extracted JSON output.
 
15
  Args:
16
- input_pdf_path (str): Path to the input PDF file.
 
17
  pdffigures2_jar (str): Path to the pdffigures2 JAR file.
 
18
  Returns:
19
- list: List of issues found based on figure/table caption rules.
 
20
  Raises:
21
  FileNotFoundError: If the input PDF or JAR file does not exist.
22
  Exception: If the command fails or the JSON output is not generated.
23
  """
24
  # Check if input file and JAR file exist
25
- if not os.path.exists(input_pdf_path):
26
- raise FileNotFoundError(f"Input PDF file not found: {input_pdf_path}")
27
  if not os.path.exists(pdffigures2_jar):
28
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
29
 
30
- temp_output_dir = None # Initialize
31
- try:
32
- # Create a temporary directory for the output JSON
33
- temp_output_dir = tempfile.mkdtemp()
34
- # Construct the expected output JSON file path INSIDE the temp directory
35
- # pdffigures2 creates a JSON named after the input file inside the dir specified by -d
36
- output_json_file = os.path.join(temp_output_dir, os.path.splitext(os.path.basename(input_pdf_path))[0] + ".json")
37
- print(f"Expecting output JSON at: {output_json_file}")
38
 
39
- # Command to execute pdffigures2
40
- # Use -d to specify the *directory* for the JSON output
41
- command = [
42
- "java", "-jar", pdffigures2_jar,
43
- input_pdf_path,
44
- "-d", temp_output_dir # Output JSON to the temp directory
45
- ]
46
- print(f"Running command: {' '.join(command)}")
47
 
 
48
  # Run the command
49
- result = subprocess.run(command, capture_output=True, text=True, check=True, timeout=60) # Added timeout
50
- print("pdffigures2 stdout:", result.stdout)
51
- # Stderr might contain useful info even on success for pdffigures2
52
- if result.stderr:
53
- print("pdffigures2 stderr:", result.stderr)
54
 
 
 
 
 
55
 
56
  # Read and return the JSON data
57
  if os.path.exists(output_json_file):
@@ -61,207 +55,81 @@ def grade_pdf(input_pdf_path, pdffigures2_jar):
61
  issueList = list()
62
 
63
  for region in regions:
64
- # Determine caption location
65
- caption_loc = "Above" # Default
66
- # Ensure keys exist before accessing
67
- if "captionBoundary" in region and "regionBoundary" in region and \
68
- isinstance(region["captionBoundary"], dict) and isinstance(region["regionBoundary"], dict) and \
69
- "y1" in region["captionBoundary"] and "y1" in region["regionBoundary"]:
70
-
71
- if(region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]):
72
- caption_loc = "Below"
73
  else:
74
- print(f"Warning: Missing boundary information for a region on page {region.get('page', 'N/A')}. Skipping location check.")
75
- continue # Skip this region if boundaries are missing
76
-
77
- region["captionLocation"] = caption_loc # Store for potential use/debugging
78
-
79
- fig_type = region.get("figType") # Use .get for safety
80
- page_num = region.get("page", -1) + 1 # Use .get and handle potential missing key
81
-
82
- issues = {} # Initialize for this specific issue potential
83
-
84
- # Check for Table caption below
85
- if caption_loc == "Below" and fig_type == "Table":
86
- issues = {
87
- "page_number": page_num,
88
- "caption_coordinate": region.get("captionBoundary"),
89
- "fig_type": fig_type,
90
- "caption_location": caption_loc,
91
- "description": "Location of the caption for tables must be above the table."
92
- }
93
  issueList.append(issues)
94
-
95
- # Check for Figure caption above
96
- elif caption_loc == "Above" and fig_type == "Figure":
97
- issues = {
98
- "page_number": page_num,
99
- "caption_coordinate": region.get("captionBoundary"),
100
- "fig_type": fig_type,
101
- "caption_location": caption_loc,
102
- "description": "Location of the caption for figures must be below the figure."
103
- }
104
- issueList.append(issues)
105
-
106
  return issueList
107
 
108
  else:
109
- # If JSON not found, include stderr from the process
110
- raise Exception(f"JSON output file not found: {output_json_file}. pdffigures2 stderr: {result.stderr if result else 'No result object'}")
111
 
112
  except subprocess.CalledProcessError as e:
113
- # Include command output/error in the exception
114
- raise Exception(f"Error while running pdffigures2 (return code {e.returncode}):\nSTDOUT:\n{e.stdout}\nSTDERR:\n{e.stderr}")
115
- except subprocess.TimeoutExpired as e:
116
- raise Exception(f"pdffigures2 command timed out after {e.timeout} seconds.\nSTDOUT:\n{e.stdout}\nSTDERR:\n{e.stderr}")
117
- except Exception as e:
118
- # Re-raise other exceptions for clarity
119
- print(f"An unexpected error occurred in grade_pdf:")
120
- traceback.print_exc() # Print detailed traceback for debugging
121
- raise e # Re-raise the original exception
122
- finally:
123
- # Clean up the temporary output directory and its contents
124
- if temp_output_dir and os.path.exists(temp_output_dir):
125
- try:
126
- shutil.rmtree(temp_output_dir)
127
- print(f"Cleaned up temporary output directory: {temp_output_dir}")
128
- except Exception as cleanup_error:
129
- print(f"Warning: Failed to cleanup temporary output directory {temp_output_dir}: {cleanup_error}")
130
-
131
 
132
- def process_file(file_bytes: bytes): # Expect raw bytes
133
  """
134
- Wrapper to process PDF bytes received from Gradio client via grade_pdf
135
- and return JSON data.
136
 
137
  Args:
138
- file_bytes (bytes): The raw byte content of the uploaded PDF file.
139
 
140
  Returns:
141
- str: JSON string of extracted data or an error message.
142
  """
143
- # Define path relative to the script location or use an absolute path
144
- pdffigures2_jar_path = "pdffigures2.jar"
145
- temp_input_path = None # Initialize path variable
146
-
147
- # --- Input Sanity Check ---
148
- if not file_bytes or not isinstance(file_bytes, bytes):
149
- return json.dumps({"error": "Invalid input: No file data received or data is not in bytes format."}, indent=4)
150
- # Simple check for PDF magic number (%PDF)
151
- if not file_bytes.startswith(b'%PDF'):
152
- print("Warning: Input data does not start with PDF magic number (%PDF).")
153
- # Decide whether to proceed or return an error
154
- # return json.dumps({"error": "Invalid input: File does not appear to be a PDF."}, indent=4)
155
-
156
- print(f"Received {len(file_bytes)} bytes of file data.")
157
 
158
  try:
159
- # Create a temporary file to store the received bytes
160
- # 'wb' mode is crucial for writing bytes
161
- # delete=False requires manual cleanup in the finally block
162
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf', mode='wb') as temp_input:
163
- temp_input.write(file_bytes)
164
- temp_input_path = temp_input.name # Get the path *after* writing
165
-
166
- # At this point, temp_input is closed, but the file persists because delete=False
167
- print(f"PDF bytes written to temporary file: {temp_input_path}")
168
-
169
- # Process the PDF using the path to the temporary file
170
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
 
 
 
171
 
172
  # Return JSON as a formatted string
173
  return json.dumps(extracted_data, indent=4)
174
-
175
- except FileNotFoundError as e:
176
- # Provide more specific feedback
177
- if "pdffigures2.jar" in str(e):
178
- error_msg = f"Configuration error: pdffigures2 JAR file not found at '{pdffigures2_jar_path}'. Ensure it's present and accessible."
179
- print(error_msg)
180
- return json.dumps({"error": error_msg}, indent=4) # Return error as JSON
181
- elif temp_input_path and str(temp_input_path) in str(e):
182
- error_msg = f"Internal error: Could not find the temporary PDF file '{temp_input_path}' after creating it. {e}"
183
- print(error_msg)
184
- return json.dumps({"error": error_msg}, indent=4)
185
- else:
186
- error_msg = f"File system error: {e}"
187
- print(error_msg)
188
- return json.dumps({"error": error_msg}, indent=4)
189
  except Exception as e:
190
- # Log the full error for server-side debugging
191
- print("------ ERROR in process_file ------")
192
- traceback.print_exc()
193
- print("-----------------------------------")
194
- # Return a generic error message as JSON to the client
195
- return json.dumps({"error": f"An unexpected error occurred during processing: {type(e).__name__}"}, indent=4)
196
- finally:
197
- # Clean up the temporary *input* file if its path was assigned
198
- if temp_input_path and os.path.exists(temp_input_path):
199
- try:
200
- os.unlink(temp_input_path)
201
- print(f"Cleaned up temporary input file: {temp_input_path}")
202
- except Exception as cleanup_error:
203
- print(f"Warning: Failed to cleanup temporary input file {temp_input_path}: {cleanup_error}")
204
 
205
  # Gradio Interface
206
  def gradio_interface():
207
- # --- Pre-launch Checks ---
208
- pdffigures2_jar_path = "pdffigures2.jar"
209
- if not os.path.exists(pdffigures2_jar_path):
210
- print(f"ERROR: {pdffigures2_jar_path} not found in the current directory.")
211
- print("Please download it and place it alongside this script.")
212
- # Consider exiting if the JAR is essential for the app to function
213
- # import sys
214
- # sys.exit(1)
215
-
216
- try:
217
- # Check for Java (more robust check)
218
- print("Checking for Java installation...")
219
- java_check = subprocess.run(["java", "-version"], capture_output=True, text=True, timeout=10)
220
- # java -version often prints to stderr
221
- if java_check.returncode == 0 and ("version" in java_check.stdout or "version" in java_check.stderr):
222
- print("Java installation found:")
223
- print(java_check.stderr if "version" in java_check.stderr else java_check.stdout)
224
- else:
225
- print(f"Warning: 'java -version' returned code {java_check.returncode} or did not contain 'version'. Ensure Java is correctly installed and in PATH.")
226
- print("STDOUT:", java_check.stdout)
227
- print("STDERR:", java_check.stderr)
228
- # Decide if this is critical enough to exit
229
- except FileNotFoundError:
230
- print("ERROR: 'java' command not found. Please install Java (JRE or JDK) and ensure it's in your system's PATH.")
231
- # import sys
232
- # sys.exit(1)
233
- except subprocess.TimeoutExpired:
234
- print("ERROR: 'java -version' command timed out. Java installation might be corrupted.")
235
- # import sys
236
- # sys.exit(1)
237
- except Exception as e:
238
- print(f"ERROR: An unexpected error occurred while checking for Java: {e}")
239
- # import sys
240
- # sys.exit(1)
241
-
242
-
243
- # --- Define Interface ---
244
  interface = gr.Interface(
245
  fn=process_file,
246
- # Expect binary data from the client
247
- inputs=gr.File(
248
- label="Upload PDF File",
249
- file_types=[".pdf"],
250
- type="binary" # Crucial change: accept bytes
251
- ),
252
- outputs=gr.Textbox(
253
- label="Analysis Results (JSON)", # More descriptive label
254
- lines=20
255
- ),
256
- title="PDF Figure/Table Caption Placement Checker",
257
- description="Upload a PDF file (via API or UI). This tool uses pdffigures2 to identify figures/tables and checks if captions are placed correctly (Tables: Above, Figures: Below). Returns results as JSON.",
258
- # Optional: Add examples if running interactively
259
- # examples=[["path/to/example1.pdf"], ["path/to/example2.pdf"]]
260
  )
261
  return interface
262
 
263
  if __name__ == "__main__":
264
  # Launch the Gradio app
265
- # share=True creates a public link (use with caution)
266
- # server_name="0.0.0.0" makes it accessible on the network
267
- gradio_interface().launch(server_name="0.0.0.0")
 
3
  import os
4
  import gradio as gr
5
  import tempfile
 
 
6
 
7
+ # Set JAVA_HOME environment variable
8
+ # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
9
 
10
+ def grade_pdf(input_pdf, pdffigures2_jar):
11
  """
12
  Process a PDF file using pdffigures2 and return the extracted JSON output.
13
+
14
  Args:
15
+ input_pdf (str): Path to the input PDF file.
16
+ output_dir (str): Path to the directory where JSON output will be saved.
17
  pdffigures2_jar (str): Path to the pdffigures2 JAR file.
18
+
19
  Returns:
20
+ dict: Parsed JSON data extracted by pdffigures2.
21
+
22
  Raises:
23
  FileNotFoundError: If the input PDF or JAR file does not exist.
24
  Exception: If the command fails or the JSON output is not generated.
25
  """
26
  # Check if input file and JAR file exist
27
+ if not os.path.exists(input_pdf):
28
+ raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
29
  if not os.path.exists(pdffigures2_jar):
30
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
31
 
 
 
 
 
 
 
 
 
32
 
33
+ # Command to execute pdffigures2
34
+ command = [
35
+ "java", "-jar", pdffigures2_jar,
36
+ input_pdf,
37
+ "-d", '_'
38
+ ]
39
+ print(input_pdf)
 
40
 
41
+ try:
42
  # Run the command
43
+ subprocess.run(command, capture_output=True, text=True, check=True)
 
 
 
 
44
 
45
+ # Construct the output JSON file path
46
+ # print()
47
+ output_json_file = "_"+os.path.splitext(os.path.basename(input_pdf))[0] +".json"
48
+ print("output_json_file: ", output_json_file)
49
 
50
  # Read and return the JSON data
51
  if os.path.exists(output_json_file):
 
55
  issueList = list()
56
 
57
  for region in regions:
58
+ issues = dict()
59
+ if(region["captionBoundary"]["y1"]>region["regionBoundary"]["y1"]):
60
+ region["captionLocation"]="Below"
 
 
 
 
 
 
61
  else:
62
+ region["captionLocation"]="Above"
63
+
64
+ if(region["captionLocation"]=="Below" and region["figType"] == "Table"):
65
+ issues["page_number"]=region["page"]
66
+ issues["caption_coordinate"]=region["captionBoundary"]
67
+ issues["fig_type"] = region["figType"]
68
+ issues["caption_location"] = region["captionLocation"]
69
+ issues["description"]="Location of the caption for tables must be above the table."
 
 
 
 
 
 
 
 
 
 
 
70
  issueList.append(issues)
71
+
72
+ if(region["captionLocation"]=="Above" and region["figType"] == "Figure"):
73
+ issues["page_number"]=region["page"]
74
+ issues["caption_coordinate"]=region["captionBoundary"]
75
+ issues["fig_type"] = region["figType"]
76
+ issues["description"]="Location of the caption for figures must be below the table."
77
+ issueList.append(issues)
78
+
 
 
 
 
79
  return issueList
80
 
81
  else:
82
+ raise Exception(f"JSON output file not found: {output_json_file}")
 
83
 
84
  except subprocess.CalledProcessError as e:
85
+ raise Exception(f"Error while running pdffigures2: {e.stderr}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
def process_file(file):
    """
    Wrapper to process an uploaded PDF via grade_pdf and return JSON data.

    The uploaded bytes are written to a temporary ``.pdf`` file, whose path is
    handed to ``grade_pdf``. The temporary file is always removed afterwards,
    whether processing succeeded or failed.

    Args:
        file (bytes): Raw content of the uploaded PDF file (the Gradio input
            is declared with ``type="binary"``).

    Returns:
        str: JSON string of the data returned by ``grade_pdf``, or a message
            starting with "An error occurred: " if anything went wrong.
    """
    # Define paths
    pdffigures2_jar_path = "pdffigures2.jar"  # Path to pdffigures2 JAR file

    # Nothing was uploaded (or the upload was empty): report it instead of
    # failing inside the temp-file write.
    if not file:
        return "An error occurred: no file data received"

    temp_input_path = None
    try:
        # The with-block closes the file, which flushes the buffered bytes to
        # disk *before* the external process is asked to read the path.
        # delete=False keeps the file alive after closing; it is removed in
        # the finally clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
            temp_input_path = temp_input.name
            temp_input.write(file)
        print("Path of the input file: ", temp_input_path)

        # Process the PDF and get JSON data
        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)

        # Return JSON as a formatted string
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # Top-level boundary for the UI: surface the failure as text.
        return f"An error occurred: {e}"
    finally:
        # Clean up the temporary input file on success and on failure alike.
        if temp_input_path is not None and os.path.exists(temp_input_path):
            try:
                os.unlink(temp_input_path)
            except OSError as cleanup_error:
                print("Failed to remove temporary input file: ", cleanup_error)
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  # Gradio Interface
def gradio_interface():
    """
    Build the Gradio interface that wires a PDF upload to process_file.

    Returns:
        The constructed ``gr.Interface`` object (not yet launched).
    """
    # Input: a single PDF upload, delivered to process_file as binary data.
    pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")

    # Output: a multi-line text box showing the string process_file returns.
    json_view = gr.Textbox(label="Extracted JSON Data", lines=20)

    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=json_view,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
132
 
if __name__ == "__main__":
    # Build the interface first, then launch the Gradio app with default settings.
    app = gradio_interface()
    app.launch()