qtpi committed on
Commit
afbc49c
·
verified ·
1 Parent(s): 5452023

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -21
app.py CHANGED
@@ -5,6 +5,9 @@ import gradio as gr
5
  import tempfile
6
 
7
  def grade_pdf(input_pdf, pdffigures2_jar):
 
 
 
8
  if not os.path.exists(input_pdf):
9
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
10
  if not os.path.exists(pdffigures2_jar):
@@ -20,64 +23,71 @@ def grade_pdf(input_pdf, pdffigures2_jar):
20
  subprocess.run(command, capture_output=True, text=True, check=True)
21
 
22
  output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
23
-
24
  if os.path.exists(output_json_file):
25
  with open(output_json_file, "r") as f:
26
  regions = json.load(f)
27
 
28
- issue_list = []
29
 
30
  for region in regions:
31
  issues = {}
32
- if region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]:
33
- region["captionLocation"] = "Below"
34
- else:
35
- region["captionLocation"] = "Above"
36
 
37
  if region["captionLocation"] == "Below" and region["figType"] == "Table":
38
- issues["page_number"] = region["page"]+1
39
  issues["caption_coordinate"] = region["captionBoundary"]
40
  issues["fig_type"] = region["figType"]
41
  issues["caption_location"] = region["captionLocation"]
42
  issues["description"] = "Location of the caption for tables must be above the table."
43
- issue_list.append(issues)
44
 
45
  if region["captionLocation"] == "Above" and region["figType"] == "Figure":
46
- issues["page_number"] = region["page"]+1
47
  issues["caption_coordinate"] = region["captionBoundary"]
48
  issues["fig_type"] = region["figType"]
49
  issues["caption_location"] = region["captionLocation"]
50
  issues["description"] = "Location of the caption for figures must be below the figure."
51
- issue_list.append(issues)
52
 
53
- return issue_list
54
  else:
55
  raise Exception(f"JSON output file not found: {output_json_file}")
56
 
57
  except subprocess.CalledProcessError as e:
58
- raise Exception(f"Error while running pdffigures2:\nSTDOUT:\n{e.stdout}\nSTDERR:\n{e.stderr}")
59
 
60
  def process_file(file):
 
 
 
61
  pdffigures2_jar_path = "pdffigures2.jar"
62
 
63
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", mode="wb") as temp_input:
64
- temp_input.write(file.read())
 
 
65
  temp_input_path = temp_input.name
66
 
67
- try:
68
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
69
- os.unlink(temp_input_path)
70
  return json.dumps(extracted_data, indent=4)
 
71
  except Exception as e:
72
- return f"An error occurred:\n{str(e)}"
 
 
 
 
73
 
74
  def gradio_interface():
75
  interface = gr.Interface(
76
  fn=process_file,
77
- inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
78
- outputs=gr.Textbox(label="Extracted JSON Data", lines=20),
79
- title="PDF Grading Interface",
80
- description="Upload a PDF to detect improper caption positions for tables and figures."
81
  )
82
  return interface
83
 
 
5
  import tempfile
6
 
7
  def grade_pdf(input_pdf, pdffigures2_jar):
8
+ """
9
+ Process a PDF file using pdffigures2 and return the extracted JSON output.
10
+ """
11
  if not os.path.exists(input_pdf):
12
  raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
13
  if not os.path.exists(pdffigures2_jar):
 
23
  subprocess.run(command, capture_output=True, text=True, check=True)
24
 
25
  output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
 
26
  if os.path.exists(output_json_file):
27
  with open(output_json_file, "r") as f:
28
  regions = json.load(f)
29
 
30
+ issueList = []
31
 
32
  for region in regions:
33
  issues = {}
34
+ caption_y = region["captionBoundary"]["y1"]
35
+ region_y = region["regionBoundary"]["y1"]
36
+ region["captionLocation"] = "Below" if caption_y > region_y else "Above"
 
37
 
38
  if region["captionLocation"] == "Below" and region["figType"] == "Table":
39
+ issues["page_number"] = region["page"]
40
  issues["caption_coordinate"] = region["captionBoundary"]
41
  issues["fig_type"] = region["figType"]
42
  issues["caption_location"] = region["captionLocation"]
43
  issues["description"] = "Location of the caption for tables must be above the table."
44
+ issueList.append(issues)
45
 
46
  if region["captionLocation"] == "Above" and region["figType"] == "Figure":
47
+ issues["page_number"] = region["page"]
48
  issues["caption_coordinate"] = region["captionBoundary"]
49
  issues["fig_type"] = region["figType"]
50
  issues["caption_location"] = region["captionLocation"]
51
  issues["description"] = "Location of the caption for figures must be below the figure."
52
+ issueList.append(issues)
53
 
54
+ return issueList
55
  else:
56
  raise Exception(f"JSON output file not found: {output_json_file}")
57
 
58
  except subprocess.CalledProcessError as e:
59
+ raise Exception(f"Error while running pdffigures2: {e.stderr}")
60
 
61
  def process_file(file):
62
+ """
63
+ Wrapper to process a file via grade_pdf and return JSON data or error as a string.
64
+ """
65
  pdffigures2_jar_path = "pdffigures2.jar"
66
 
67
+ temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
68
+ try:
69
+ temp_input.write(file)
70
+ temp_input.close()
71
  temp_input_path = temp_input.name
72
 
 
73
  extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
74
+
75
  return json.dumps(extracted_data, indent=4)
76
+
77
  except Exception as e:
78
+ return f"An error occurred: {str(e)}"
79
+
80
+ finally:
81
+ if os.path.exists(temp_input.name):
82
+ os.unlink(temp_input.name)
83
 
84
  def gradio_interface():
85
  interface = gr.Interface(
86
  fn=process_file,
87
+ inputs=gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary"),
88
+ outputs=gr.Textbox(label="Output", lines=20),
89
+ title="PDF Caption Grader",
90
+ description="Detects misplacement of captions in figures and tables in PDF files."
91
  )
92
  return interface
93