qtpi committed on
Commit
3d04d70
·
verified ·
1 Parent(s): 284b991

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -85
app.py CHANGED
@@ -4,132 +4,80 @@ import os
4
  import gradio as gr
5
  import tempfile
6
 
7
- # Set JAVA_HOME environment variable
8
- # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
9
-
10
  def grade_pdf(input_pdf, pdffigures2_jar):
11
- """
12
- Process a PDF file using pdffigures2 and return the extracted JSON output.
13
-
14
- Args:
15
- input_pdf (str): Path to the input PDF file.
16
- output_dir (str): Path to the directory where JSON output will be saved.
17
- pdffigures2_jar (str): Path to the pdffigures2 JAR file.
18
-
19
- Returns:
20
- dict: Parsed JSON data extracted by pdffigures2.
21
-
22
- Raises:
23
- FileNotFoundError: If the input PDF or JAR file does not exist.
24
- Exception: If the command fails or the JSON output is not generated.
25
- """
26
- # Check if input file and JAR file exist
27
  if not os.path.exists(input_pdf):
28
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
29
  if not os.path.exists(pdffigures2_jar):
30
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
31
 
32
-
33
- # Command to execute pdffigures2
34
  command = [
35
  "java", "-jar", pdffigures2_jar,
36
  input_pdf,
37
- "-d", '_'
38
  ]
39
- print(input_pdf)
40
-
41
  try:
42
- # Run the command
43
  subprocess.run(command, capture_output=True, text=True, check=True)
44
 
45
- # Construct the output JSON file path
46
- # print()
47
- output_json_file = "_"+os.path.splitext(os.path.basename(input_pdf))[0] +".json"
48
- print("output_json_file: ", output_json_file)
49
-
50
- # Read and return the JSON data
51
  if os.path.exists(output_json_file):
52
  with open(output_json_file, "r") as f:
53
  regions = json.load(f)
54
 
55
- issueList = list()
56
-
57
  for region in regions:
58
- issues = dict()
59
- if(region["captionBoundary"]["y1"]>region["regionBoundary"]["y1"]):
60
- region["captionLocation"]="Below"
61
- else:
62
- region["captionLocation"]="Above"
63
-
64
- if(region["captionLocation"]=="Below" and region["figType"] == "Table"):
65
- issues["page_number"]=region["page"]+1
66
- issues["caption_coordinate"]=region["captionBoundary"]
67
- issues["fig_type"] = region["figType"]
68
- issues["caption_location"] = region["captionLocation"]
69
- issues["description"]="Location of the caption for tables must be above the table."
70
  issueList.append(issues)
71
-
72
- if(region["captionLocation"]=="Above" and region["figType"] == "Figure"):
73
- issues["page_number"]=region["page"]+1
74
- issues["caption_coordinate"]=region["captionBoundary"]
75
- issues["fig_type"] = region["figType"]
76
- issues["description"]="Location of the caption for figures must be below the table."
 
 
 
77
  issueList.append(issues)
78
-
79
- return issueList
80
 
 
81
  else:
82
  raise Exception(f"JSON output file not found: {output_json_file}")
83
-
84
  except subprocess.CalledProcessError as e:
85
  raise Exception(f"Error while running pdffigures2: {e.stderr}")
86
 
87
- def process_file(file):
88
- """
89
- Wrapper to process a file via grade_pdf and return JSON data.
90
-
91
- Args:
92
- file (str): Path to the uploaded PDF file.
93
 
94
- Returns:
95
- str: JSON string of extracted data.
96
- """
97
- # Define paths
98
- pdffigures2_jar_path = "pdffigures2.jar" # Path to pdffigures2 JAR file
99
-
100
- temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
101
- temp_input.write(file)
102
- temp_input_path = temp_input.name
103
- print("Path of the input file: ",temp_input_path)
104
 
105
  try:
106
- # Process the PDF and get JSON data
107
- # print(file.name)
108
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
109
- # Clean up the temporary input file
110
  os.unlink(temp_input_path)
111
-
112
-
113
- # Return JSON as a formatted string
114
  return json.dumps(extracted_data, indent=4)
115
  except Exception as e:
116
  return f"An error occurred: {e}"
117
 
118
- # Gradio Interface
119
  def gradio_interface():
120
  interface = gr.Interface(
121
  fn=process_file,
122
-
123
- inputs=gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary"),
124
-
125
- outputs=gr.Textbox(label="Extracted JSON Data", lines=20),
126
-
127
  title="PDF Grading Interface",
128
-
129
  description="Upload a PDF file, and this tool will extract figures and captions as JSON."
130
  )
131
  return interface
132
 
133
  if __name__ == "__main__":
134
- # Launch the Gradio app
135
- gradio_interface().launch()
 
4
  import gradio as gr
5
  import tempfile
6
 
 
 
 
7
  def grade_pdf(input_pdf, pdffigures2_jar):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  if not os.path.exists(input_pdf):
9
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
10
  if not os.path.exists(pdffigures2_jar):
11
  raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")
12
 
 
 
13
  command = [
14
  "java", "-jar", pdffigures2_jar,
15
  input_pdf,
16
+ "-d", "_"
17
  ]
 
 
18
  try:
 
19
  subprocess.run(command, capture_output=True, text=True, check=True)
20
 
21
+ output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
 
 
 
 
 
22
  if os.path.exists(output_json_file):
23
  with open(output_json_file, "r") as f:
24
  regions = json.load(f)
25
 
26
+ issueList = []
 
27
  for region in regions:
28
+ issues = {}
29
+ caption_below = region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]
30
+ region["captionLocation"] = "Below" if caption_below else "Above"
31
+
32
+ if region["captionLocation"] == "Below" and region["figType"] == "Table":
33
+ issues.update({
34
+ "page_number": region["page"] + 1,
35
+ "caption_coordinate": region["captionBoundary"],
36
+ "fig_type": region["figType"],
37
+ "caption_location": region["captionLocation"],
38
+ "description": "Location of the caption for tables must be above the table."
39
+ })
40
  issueList.append(issues)
41
+
42
+ if region["captionLocation"] == "Above" and region["figType"] == "Figure":
43
+ issues.update({
44
+ "page_number": region["page"] + 1,
45
+ "caption_coordinate": region["captionBoundary"],
46
+ "fig_type": region["figType"],
47
+ "caption_location": region["captionLocation"],
48
+ "description": "Location of the caption for figures must be below the figure."
49
+ })
50
  issueList.append(issues)
 
 
51
 
52
+ return issueList
53
  else:
54
  raise Exception(f"JSON output file not found: {output_json_file}")
 
55
  except subprocess.CalledProcessError as e:
56
  raise Exception(f"Error while running pdffigures2: {e.stderr}")
57
 
58
def _read_upload(file_obj):
    """Return the raw bytes of an upload given as bytes, a path, or a file-like object."""
    if file_obj is None:
        raise ValueError("no PDF file was uploaded")
    if isinstance(file_obj, (bytes, bytearray)):
        return bytes(file_obj)
    if isinstance(file_obj, (str, os.PathLike)):
        with open(file_obj, "rb") as source:
            return source.read()

    # NOTE(review): some upload wrappers expose the on-disk copy through
    # ``.name``; reading that path does not depend on the object's current
    # file position or on whether it is still open -- confirm against the
    # installed Gradio version.
    backing_path = getattr(file_obj, "name", None)
    if isinstance(backing_path, str) and os.path.isfile(backing_path):
        with open(backing_path, "rb") as source:
            return source.read()
    return file_obj.read()


def process_file(file_obj):
    """Grade an uploaded PDF and return the issues found as a JSON string.

    The upload is copied into a temporary ``.pdf`` file, passed to
    ``grade_pdf`` together with the JAR path ``pdffigures2.jar``, and the
    temporary file is deleted afterwards whether or not grading succeeded.

    Args:
        file_obj: The uploaded PDF as raw bytes, a filesystem path, or an
            object with a ``read()`` method.

    Returns:
        The result of ``grade_pdf`` serialised with ``json.dumps(indent=4)``,
        or the text ``"An error occurred: <message>"`` if anything fails.
    """
    pdffigures2_jar_path = "pdffigures2.jar"
    temp_input_path = None

    try:
        payload = _read_upload(file_obj)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_input:
            # Record the name first so the cleanup below still runs if the
            # write itself fails.
            temp_input_path = temp_input.name
            temp_input.write(payload)

        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # Errors are shown to the user in the output box instead of being raised.
        return f"An error occurred: {e}"
    finally:
        if temp_input_path is not None:
            try:
                os.unlink(temp_input_path)
            except FileNotFoundError:
                pass
71
 
 
72
  def gradio_interface():
73
  interface = gr.Interface(
74
  fn=process_file,
75
+ inputs=gr.File(file_types=[".pdf"], label="Upload PDF File"),
76
+ outputs=gr.Textbox(lines=20, label="Extracted JSON Data"),
 
 
 
77
  title="PDF Grading Interface",
 
78
  description="Upload a PDF file, and this tool will extract figures and captions as JSON."
79
  )
80
  return interface
81
 
82
  if __name__ == "__main__":
83
+ gradio_interface().launch()