qtpi committed on
Commit
38eb864
·
verified ·
1 Parent(s): 369b866

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -34
app.py CHANGED
@@ -4,80 +4,128 @@ import os
4
  import gradio as gr
5
  import tempfile
6
 
 
 
 
def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Run pdffigures2 on a PDF and list captions placed on the wrong side.

    pdffigures2 is invoked with ``-d _`` and its JSON result is read back from
    ``_<pdf basename>.json`` in the current working directory. A caption counts
    as "Below" when its ``y1`` is greater than the region's ``y1``, otherwise
    "Above". Tables with a caption below and figures with a caption above are
    reported.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.
    Returns:
        list[dict]: One record per offending region with the keys
        ``page_number``, ``caption_coordinate``, ``fig_type``,
        ``caption_location`` and ``description``.
    Raises:
        FileNotFoundError: If the input PDF or the JAR file does not exist.
        Exception: If pdffigures2 exits with an error or writes no JSON file.
    """
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    command = [
        "java", "-jar", pdffigures2_jar,
        input_pdf,
        "-d", "_",
    ]
    # Only the subprocess call is guarded, so unrelated errors are not relabelled.
    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        raise Exception(f"Error while running pdffigures2: {e.stderr}") from e

    output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
    if not os.path.exists(output_json_file):
        raise Exception(f"JSON output file not found: {output_json_file}")

    # The JSON file is only an intermediate artifact; remove it once read so
    # repeated runs do not litter the working directory.
    try:
        with open(output_json_file, "r") as f:
            regions = json.load(f)
    finally:
        os.remove(output_json_file)

    issueList = []
    for region in regions:
        caption_below = region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]
        caption_location = "Below" if caption_below else "Above"
        fig_type = region["figType"]

        if caption_location == "Below" and fig_type == "Table":
            description = "Location of the caption for tables must be above the table."
        elif caption_location == "Above" and fig_type == "Figure":
            description = "Location of the caption for figures must be below the figure."
        else:
            continue

        issueList.append({
            # +1 turns the reported page index into a human-facing page number
            # (presumably pdffigures2 pages are 0-based; confirm against its output).
            "page_number": region["page"] + 1,
            "caption_coordinate": region["captionBoundary"],
            "fig_type": fig_type,
            "caption_location": caption_location,
            "description": description,
        })
    return issueList
57
 
def process_file(file_obj):
    """
    Copy an uploaded PDF to a temporary file, grade it, and return JSON text.

    Args:
        file_obj: File-like object with a ``read()`` method that yields the
            PDF bytes, or ``None`` when nothing was uploaded.
    Returns:
        str: The issues from ``grade_pdf`` as indented JSON, or a message of
        the form ``"An error occurred: ..."`` when processing fails.
    """
    if file_obj is None:
        return "An error occurred: no file was uploaded"

    pdffigures2_jar_path = "pdffigures2.jar"

    # delete=False keeps the file on disk after the handle is closed, so the
    # external Java process can open it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_input:
        temp_input.write(file_obj.read())
        temp_input_path = temp_input.name

    try:
        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        return f"An error occurred: {e}"
    finally:
        # Remove the temporary copy on success and on failure alike.
        os.unlink(temp_input_path)
71
 
 
def gradio_interface():
    """Build and return the Gradio interface that wraps ``process_file``."""
    pdf_upload = gr.File(file_types=[".pdf"], label="Upload PDF File")
    json_output = gr.Textbox(lines=20, label="Extracted JSON Data")
    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=json_output,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
81
 
if __name__ == "__main__":
    # Build the interface first, then start the Gradio server with defaults.
    app = gradio_interface()
    app.launch()
 
 
 
4
  import gradio as gr
5
  import tempfile
6
 
7
+ # Set JAVA_HOME environment variable
8
+ # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
9
+
def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Run pdffigures2 on a PDF and list captions placed on the wrong side.

    pdffigures2 is invoked with ``-d _`` and its JSON result is read back from
    ``_<pdf basename>.json`` in the current working directory. Every region in
    that JSON is checked: tables must have their caption above, figures must
    have it below.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.
    Returns:
        list[dict]: One record per offending region with the keys
        ``page_number``, ``caption_coordinate``, ``fig_type``,
        ``caption_location`` and ``description``.
    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        Exception: If the command fails or the JSON output is not generated.
    """
    # Check if input file and JAR file exist
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    # Command to execute pdffigures2
    command = [
        "java", "-jar", pdffigures2_jar,
        input_pdf,
        "-d", "_",
    ]

    # Only the subprocess call is guarded, so unrelated errors are not relabelled.
    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        raise Exception(f"Error while running pdffigures2: {e.stderr}") from e

    # Construct the output JSON file path
    output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
    if not os.path.exists(output_json_file):
        raise Exception(f"JSON output file not found: {output_json_file}")

    # The JSON file is only an intermediate artifact; remove it once read so
    # repeated runs do not litter the working directory.
    try:
        with open(output_json_file, "r") as f:
            regions = json.load(f)
    finally:
        os.remove(output_json_file)

    issueList = []
    for region in regions:
        issue = _caption_issue(region)
        if issue is not None:
            issueList.append(issue)
    return issueList


def _caption_issue(region):
    """Return the issue record for one region, or None if its caption is placed correctly."""
    # The caption is "Below" when its y1 is greater than the region's y1.
    if region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]:
        caption_location = "Below"
    else:
        caption_location = "Above"
    fig_type = region["figType"]

    if caption_location == "Below" and fig_type == "Table":
        description = "Location of the caption for tables must be above the table."
    elif caption_location == "Above" and fig_type == "Figure":
        description = "Location of the caption for figures must be below the figure."
    else:
        return None

    return {
        # NOTE(review): the page value is passed through unchanged; if
        # pdffigures2 reports 0-based pages this is off by one for readers.
        "page_number": region["page"],
        "caption_coordinate": region["captionBoundary"],
        "fig_type": fig_type,
        "caption_location": caption_location,
        "description": description,
    }
83
 
def process_file(file):
    """
    Wrapper to process an uploaded PDF via grade_pdf and return JSON data.

    Args:
        file (bytes): Raw content of the uploaded PDF, or ``None`` when
            nothing was uploaded.
    Returns:
        str: JSON string of the issues found, or a message of the form
        ``"An error occurred: ..."`` when processing fails.
    """
    if file is None:
        return "An error occurred: no file was uploaded"

    # Define paths
    pdffigures2_jar_path = "pdffigures2.jar"  # Path to pdffigures2 JAR file

    # Leaving the ``with`` block flushes and closes the handle, so the bytes
    # are on disk before the external Java process opens the path.
    # delete=False keeps the file around after that close.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
        temp_input.write(file)
        temp_input_path = temp_input.name

    try:
        # Process the PDF and return the result as a formatted JSON string
        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        return f"An error occurred: {e}"
    finally:
        # Clean up the temporary input file on success and on failure alike
        os.unlink(temp_input_path)
112
 
113
+ # Gradio Interface
def gradio_interface():
    """Build and return the Gradio interface that wraps ``process_file``."""
    # type="binary" makes Gradio pass the upload to process_file as bytes.
    pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")
    json_output = gr.Textbox(label="Extracted JSON Data", lines=20)
    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=json_output,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
127
 
if __name__ == "__main__":
    # Launch the Gradio app on the host/port expected by Hugging Face:
    # listen on all interfaces, port 7860.
    app = gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)
131
+