qtpi committed on
Commit
a879c17
·
verified ·
1 Parent(s): d0f0aa4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -80
app.py CHANGED
@@ -4,132 +4,82 @@ import os
4
  import gradio as gr
5
  import tempfile
6
 
7
- # Set JAVA_HOME environment variable
8
- # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
9
-
10
  def grade_pdf(input_pdf, pdffigures2_jar):
11
- """
12
- Process a PDF file using pdffigures2 and return the extracted JSON output.
13
-
14
- Args:
15
- input_pdf (str): Path to the input PDF file.
16
- output_dir (str): Path to the directory where JSON output will be saved.
17
- pdffigures2_jar (str): Path to the pdffigures2 JAR file.
18
-
19
- Returns:
20
- dict: Parsed JSON data extracted by pdffigures2.
21
-
22
- Raises:
23
- FileNotFoundError: If the input PDF or JAR file does not exist.
24
- Exception: If the command fails or the JSON output is not generated.
25
- """
26
- # Check if input file and JAR file exist
27
  if not os.path.exists(input_pdf):
28
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
29
  if not os.path.exists(pdffigures2_jar):
30
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
31
 
32
-
33
- # Command to execute pdffigures2
34
  command = [
35
  "java", "-jar", pdffigures2_jar,
36
  input_pdf,
37
- "-d", '_'
38
  ]
39
- print(input_pdf)
40
 
41
  try:
42
- # Run the command
43
  subprocess.run(command, capture_output=True, text=True, check=True)
44
 
45
- # Construct the output JSON file path
46
- # print()
47
- output_json_file = "_"+os.path.splitext(os.path.basename(input_pdf))[0] +".json"
48
- print("output_json_file: ", output_json_file)
49
 
50
- # Read and return the JSON data
51
  if os.path.exists(output_json_file):
52
  with open(output_json_file, "r") as f:
53
  regions = json.load(f)
54
 
55
- issueList = list()
56
 
57
  for region in regions:
58
- issues = dict()
59
- if(region["captionBoundary"]["y1"]>region["regionBoundary"]["y1"]):
60
- region["captionLocation"]="Below"
61
  else:
62
- region["captionLocation"]="Above"
63
 
64
- if(region["captionLocation"]=="Below" and region["figType"] == "Table"):
65
- issues["page_number"]=region["page"]
66
- issues["caption_coordinate"]=region["captionBoundary"]
67
  issues["fig_type"] = region["figType"]
68
  issues["caption_location"] = region["captionLocation"]
69
- issues["description"]="Location of the caption for tables must be above the table."
70
- issueList.append(issues)
71
-
72
- if(region["captionLocation"]=="Above" and region["figType"] == "Figure"):
73
- issues["page_number"]=region["page"]
74
- issues["caption_coordinate"]=region["captionBoundary"]
75
  issues["fig_type"] = region["figType"]
76
- issues["description"]="Location of the caption for figures must be below the table."
77
- issueList.append(issues)
78
-
79
- return issueList
80
 
 
81
  else:
82
  raise Exception(f"JSON output file not found: {output_json_file}")
83
 
84
  except subprocess.CalledProcessError as e:
85
- raise Exception(f"Error while running pdffigures2: {e.stderr}")
86
 
87
  def process_file(file):
88
- """
89
- Wrapper to process a file via grade_pdf and return JSON data.
90
-
91
- Args:
92
- file (str): Path to the uploaded PDF file.
93
-
94
- Returns:
95
- str: JSON string of extracted data.
96
- """
97
- # Define paths
98
- pdffigures2_jar_path = "pdffigures2.jar" # Path to pdffigures2 JAR file
99
-
100
- temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
101
- temp_input.write(file)
102
- temp_input_path = temp_input.name
103
- print("Path of the input file: ",temp_input_path)
104
 
105
  try:
106
- # Process the PDF and get JSON data
107
- # print(file.name)
108
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
109
- # Clean up the temporary input file
110
  os.unlink(temp_input_path)
111
-
112
-
113
- # Return JSON as a formatted string
114
  return json.dumps(extracted_data, indent=4)
115
  except Exception as e:
116
- return f"An error occurred: {e}"
117
 
118
- # Gradio Interface
119
  def gradio_interface():
120
  interface = gr.Interface(
121
  fn=process_file,
122
-
123
- inputs=gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary"),
124
-
125
  outputs=gr.Textbox(label="Extracted JSON Data", lines=20),
126
-
127
  title="PDF Grading Interface",
128
-
129
- description="Upload a PDF file, and this tool will extract figures and captions as JSON."
130
  )
131
  return interface
132
 
133
  if __name__ == "__main__":
134
- # Launch the Gradio app
135
- gradio_interface().launch()
 
4
  import gradio as gr
5
  import tempfile
6
 
 
 
 
7
  def grade_pdf(input_pdf, pdffigures2_jar):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  if not os.path.exists(input_pdf):
9
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
10
  if not os.path.exists(pdffigures2_jar):
11
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
12
 
 
 
13
  command = [
14
  "java", "-jar", pdffigures2_jar,
15
  input_pdf,
16
+ "-d", "_"
17
  ]
 
18
 
19
  try:
 
20
  subprocess.run(command, capture_output=True, text=True, check=True)
21
 
22
+ output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
 
 
 
23
 
 
24
  if os.path.exists(output_json_file):
25
  with open(output_json_file, "r") as f:
26
  regions = json.load(f)
27
 
28
+ issue_list = []
29
 
30
  for region in regions:
31
+ issues = {}
32
+ if region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]:
33
+ region["captionLocation"] = "Below"
34
  else:
35
+ region["captionLocation"] = "Above"
36
 
37
+ if region["captionLocation"] == "Below" and region["figType"] == "Table":
38
+ issues["page_number"] = region["page"]+1
39
+ issues["caption_coordinate"] = region["captionBoundary"]
40
  issues["fig_type"] = region["figType"]
41
  issues["caption_location"] = region["captionLocation"]
42
+ issues["description"] = "Location of the caption for tables must be above the table."
43
+ issue_list.append(issues)
44
+
45
+ if region["captionLocation"] == "Above" and region["figType"] == "Figure":
46
+ issues["page_number"] = region["page"]+1
47
+ issues["caption_coordinate"] = region["captionBoundary"]
48
  issues["fig_type"] = region["figType"]
49
+ issues["caption_location"] = region["captionLocation"]
50
+ issues["description"] = "Location of the caption for figures must be below the figure."
51
+ issue_list.append(issues)
 
52
 
53
+ return issue_list
54
  else:
55
  raise Exception(f"JSON output file not found: {output_json_file}")
56
 
57
  except subprocess.CalledProcessError as e:
58
+ raise Exception(f"Error while running pdffigures2:\nSTDOUT:\n{e.stdout}\nSTDERR:\n{e.stderr}")
59
 
60
  def process_file(file):
61
+ pdffigures2_jar_path = "pdffigures2.jar"
62
+
63
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", mode="wb") as temp_input:
64
+ temp_input.write(file.read())
65
+ temp_input_path = temp_input.name
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  try:
 
 
68
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
 
69
  os.unlink(temp_input_path)
 
 
 
70
  return json.dumps(extracted_data, indent=4)
71
  except Exception as e:
72
+ return f"An error occurred:\n{str(e)}"
73
 
 
74
  def gradio_interface():
75
  interface = gr.Interface(
76
  fn=process_file,
77
+ inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
 
 
78
  outputs=gr.Textbox(label="Extracted JSON Data", lines=20),
 
79
  title="PDF Grading Interface",
80
+ description="Upload a PDF to detect improper caption positions for tables and figures."
 
81
  )
82
  return interface
83
 
84
  if __name__ == "__main__":
85
+ gradio_interface().launch()