qtpi committed on
Commit
da3917e
·
verified ·
1 Parent(s): 79edf21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -36
app.py CHANGED
@@ -4,54 +4,80 @@ import os
4
  import gradio as gr
5
  import tempfile
6
 
 
 
 
7
  def grade_pdf(input_pdf, pdffigures2_jar):
8
  """
9
  Process a PDF file using pdffigures2 and return the extracted JSON output.
 
 
 
 
 
 
 
 
 
 
 
 
10
  """
 
11
  if not os.path.exists(input_pdf):
12
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
13
  if not os.path.exists(pdffigures2_jar):
14
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
15
 
 
 
16
  command = [
17
  "java", "-jar", pdffigures2_jar,
18
  input_pdf,
19
- "-d", "_"
20
  ]
 
21
 
22
  try:
 
23
  subprocess.run(command, capture_output=True, text=True, check=True)
24
 
25
- output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
 
 
 
 
 
26
  if os.path.exists(output_json_file):
27
  with open(output_json_file, "r") as f:
28
  regions = json.load(f)
29
 
30
- issueList = []
31
 
32
  for region in regions:
33
- issues = {}
34
- caption_y = region["captionBoundary"]["y1"]
35
- region_y = region["regionBoundary"]["y1"]
36
- region["captionLocation"] = "Below" if caption_y > region_y else "Above"
37
-
38
- if region["captionLocation"] == "Below" and region["figType"] == "Table":
39
- issues["page_number"] = region["page"]
40
- issues["caption_coordinate"] = region["captionBoundary"]
 
41
  issues["fig_type"] = region["figType"]
42
  issues["caption_location"] = region["captionLocation"]
43
- issues["description"] = "Location of the caption for tables must be above the table."
44
  issueList.append(issues)
45
-
46
- if region["captionLocation"] == "Above" and region["figType"] == "Figure":
47
- issues["page_number"] = region["page"]
48
- issues["caption_coordinate"] = region["captionBoundary"]
49
  issues["fig_type"] = region["figType"]
50
- issues["caption_location"] = region["captionLocation"]
51
- issues["description"] = "Location of the caption for figures must be below the figure."
52
  issueList.append(issues)
53
-
54
  return issueList
 
55
  else:
56
  raise Exception(f"JSON output file not found: {output_json_file}")
57
 
@@ -60,36 +86,50 @@ def grade_pdf(input_pdf, pdffigures2_jar):
60
 
61
  def process_file(file):
62
  """
63
- Wrapper to process a file via grade_pdf and return JSON data or error as a string.
64
- """
65
- pdffigures2_jar_path = "pdffigures2.jar"
 
66
 
 
 
 
 
 
 
67
  temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
68
- try:
69
- temp_input.write(file)
70
- temp_input.close()
71
- temp_input_path = temp_input.name
72
 
 
 
 
73
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
 
 
 
74
 
 
75
  return json.dumps(extracted_data, indent=4)
76
-
77
  except Exception as e:
78
- return f"An error occurred: {str(e)}"
79
 
80
- finally:
81
- if os.path.exists(temp_input.name):
82
- os.unlink(temp_input.name)
83
-
84
  def gradio_interface():
85
  interface = gr.Interface(
86
  fn=process_file,
 
87
  inputs=gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary"),
88
- outputs=gr.Textbox(label="Output", lines=20, type="text"), # <- FIX HERE
89
- title="PDF Caption Grader",
90
- description="Detects misplacement of captions in figures and tables in PDF files."
 
 
 
91
  )
92
  return interface
93
 
94
  if __name__ == "__main__":
95
- gradio_interface().launch()
 
 
4
  import gradio as gr
5
  import tempfile
6
 
7
+ # Set JAVA_HOME environment variable
8
+ # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
9
+
10
  def grade_pdf(input_pdf, pdffigures2_jar):
11
  """
12
  Process a PDF file using pdffigures2 and return the extracted JSON output.
13
+
14
+ Args:
15
+ input_pdf (str): Path to the input PDF file.
16
+ Note: there is no output_dir parameter; pdffigures2 is invoked with "-d _" and the JSON is read from "_<input basename>.json" in the working directory.
17
+ pdffigures2_jar (str): Path to the pdffigures2 JAR file.
18
+
19
+ Returns:
20
+ list: One issue dict for each table or figure whose caption is on the wrong side.
21
+
22
+ Raises:
23
+ FileNotFoundError: If the input PDF or JAR file does not exist.
24
+ Exception: If the command fails or the JSON output is not generated.
25
  """
26
+ # Check if input file and JAR file exist
27
  if not os.path.exists(input_pdf):
28
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
29
  if not os.path.exists(pdffigures2_jar):
30
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
31
 
32
+
33
+ # Command to execute pdffigures2
34
  command = [
35
  "java", "-jar", pdffigures2_jar,
36
  input_pdf,
37
+ "-d", '_'
38
  ]
39
+ print(input_pdf)
40
 
41
  try:
42
+ # Run the command
43
  subprocess.run(command, capture_output=True, text=True, check=True)
44
 
45
+ # Construct the output JSON file path
46
+ # print()
47
+ output_json_file = "_"+os.path.splitext(os.path.basename(input_pdf))[0] +".json"
48
+ print("output_json_file: ", output_json_file)
49
+
50
+ # Read and return the JSON data
51
  if os.path.exists(output_json_file):
52
  with open(output_json_file, "r") as f:
53
  regions = json.load(f)
54
 
55
+ issueList = list()
56
 
57
  for region in regions:
58
+ issues = dict()
59
+ if(region["captionBoundary"]["y1"]>region["regionBoundary"]["y1"]):
60
+ region["captionLocation"]="Below"
61
+ else:
62
+ region["captionLocation"]="Above"
63
+
64
+ if(region["captionLocation"]=="Below" and region["figType"] == "Table"):
65
+ issues["page_number"]=region["page"]+1
66
+ issues["caption_coordinate"]=region["captionBoundary"]
67
  issues["fig_type"] = region["figType"]
68
  issues["caption_location"] = region["captionLocation"]
69
+ issues["description"]="Location of the caption for tables must be above the table."
70
  issueList.append(issues)
71
+
72
+ if(region["captionLocation"]=="Above" and region["figType"] == "Figure"):
73
+ issues["page_number"]=region["page"]+1
74
+ issues["caption_coordinate"]=region["captionBoundary"]
75
  issues["fig_type"] = region["figType"]
76
+ issues["description"]="Location of the caption for figures must be below the table."
 
77
  issueList.append(issues)
78
+
79
  return issueList
80
+
81
  else:
82
  raise Exception(f"JSON output file not found: {output_json_file}")
83
 
 
86
 
87
  def process_file(file):
88
  """
89
+ Wrapper to process a file via grade_pdf and return JSON data.
90
+
91
+ Args:
92
+ file (str): Path to the uploaded PDF file.
93
 
94
+ Returns:
95
+ str: JSON string of extracted data.
96
+ """
97
+ # Define paths
98
+ pdffigures2_jar_path = "pdffigures2.jar" # Path to pdffigures2 JAR file
99
+
100
  temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
101
+ temp_input.write(file)
102
+ temp_input_path = temp_input.name
103
+ print("Path of the input file: ",temp_input_path)
 
104
 
105
+ try:
106
+ # Process the PDF and get JSON data
107
+ # print(file.name)
108
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
109
+ # Clean up the temporary input file
110
+ os.unlink(temp_input_path)
111
+
112
 
113
+ # Return JSON as a formatted string
114
  return json.dumps(extracted_data, indent=4)
 
115
  except Exception as e:
116
+ return f"An error occurred: {e}"
117
 
118
+ # Gradio Interface
 
 
 
119
  def gradio_interface():
120
  interface = gr.Interface(
121
  fn=process_file,
122
+
123
  inputs=gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary"),
124
+
125
+ outputs=gr.Textbox(label="Extracted JSON Data", lines=20),
126
+
127
+ title="PDF Grading Interface",
128
+
129
+ description="Upload a PDF file, and this tool will extract figures and captions as JSON."
130
  )
131
  return interface
132
 
133
  if __name__ == "__main__":
134
+ # Launch the Gradio app
135
+ gradio_interface().launch()