File size: 4,789 Bytes
a033fd2 9f68e72 a033fd2 38eb864 e31f53e 38eb864 e31f53e a033fd2 38eb864 e31f53e 38eb864 e31f53e 38eb864 e31f53e 38eb864 e31f53e a033fd2 38eb864 a033fd2 fda9ac3 38eb864 fda9ac3 38eb864 3d04d70 38eb864 a932dee 38eb864 afbc49c 38eb864 a932dee 38eb864 cc8bf84 38eb864 3d04d70 38eb864 a033fd2 e31f53e 38eb864 a033fd2 afbc49c a033fd2 38eb864 a033fd2 da3917e 38eb864 9f68e72 38eb864 da3917e 38eb864 a033fd2 da3917e afbc49c 38eb864 a033fd2 38eb864 da3917e 38eb864 da3917e a033fd2 38eb864 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import subprocess
import json
import os
import gradio as gr
import tempfile
# Set JAVA_HOME environment variable
# os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
def _find_caption_issues(regions):
    """Return one issue record for every region whose caption is misplaced.

    Args:
        regions (list[dict]): Region records loaded from the pdffigures2 JSON
            output. Each record is read for its "captionBoundary",
            "regionBoundary", "figType" and "page" entries.

    Returns:
        list[dict]: One dict per offending region, in input order.
    """
    # Rule table: figure type -> (caption location that is an error, message).
    rules = {
        "Table": (
            "Below",
            "Location of the caption for tables must be above the table.",
        ),
        "Figure": (
            "Above",
            "Location of the caption for figures must be below the figure.",
        ),
    }

    issue_list = []
    for region in regions:
        caption_boundary = region["captionBoundary"]
        # A caption whose y1 is larger than the region's y1 is treated as lying
        # below it (assumes y grows down the page -- TODO confirm against the
        # pdffigures2 output format).
        if caption_boundary["y1"] > region["regionBoundary"]["y1"]:
            caption_location = "Below"
        else:
            caption_location = "Above"

        rule = rules.get(region["figType"])
        if rule is None or rule[0] != caption_location:
            continue

        issue_list.append({
            # The stored page index is shifted by one for reporting
            # (presumably zero-based in the pdffigures2 output -- verify).
            "page_number": region["page"] + 1,
            "caption_coordinate": caption_boundary,
            "fig_type": region["figType"],
            "caption_location": caption_location,
            "description": rule[1],
        })
    return issue_list


def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Run pdffigures2 on a PDF and report tables/figures with misplaced captions.

    pdffigures2 is invoked with the data prefix "_", and the resulting
    "_<pdf base name>.json" file is looked up in the current working
    directory. The file is deleted again once it has been read so repeated
    runs do not accumulate output files.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.

    Returns:
        list[dict]: One record per problem found. A record is produced for a
            "Table" whose caption is below it and for a "Figure" whose caption
            is above it. Keys: "page_number", "caption_coordinate",
            "fig_type", "caption_location" and "description".

    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        Exception: If the command fails or the JSON output is not generated.
    """
    # Fail early with a clear message rather than letting java report it.
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    command = ["java", "-jar", pdffigures2_jar, input_pdf, "-d", "_"]
    print(input_pdf)

    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Keep the original error attached so the exit status is not lost.
        raise Exception(f"Error while running pdffigures2: {e.stderr}") from e

    # The "_" prefix passed via -d is prepended to the PDF's base name.
    pdf_stem = os.path.splitext(os.path.basename(input_pdf))[0]
    output_json_file = "_" + pdf_stem + ".json"
    print("output_json_file: ", output_json_file)

    if not os.path.exists(output_json_file):
        raise Exception(f"JSON output file not found: {output_json_file}")

    try:
        with open(output_json_file, "r") as f:
            regions = json.load(f)
    finally:
        # Best-effort cleanup: the JSON is only an intermediate artifact.
        try:
            os.remove(output_json_file)
        except OSError:
            pass

    return _find_caption_issues(regions)
def process_file(file):
    """
    Write uploaded PDF bytes to a temporary file, grade it and return the result.

    The bytes are written to a temporary ".pdf" file that is closed before
    grade_pdf is called, so the full content is on disk when the external
    process reads it. The temporary file is removed afterwards whether or not
    grading succeeded.

    Args:
        file (bytes | None): Raw content of the uploaded PDF. None is accepted
            and reported as an error (no upload).

    Returns:
        str: The grade_pdf result as an indented JSON string, or a message
            starting with "An error occurred:" if anything went wrong.
    """
    pdffigures2_jar_path = "pdffigures2.jar"  # Path to pdffigures2 JAR file

    if file is None:
        return "An error occurred: no PDF file was uploaded."

    temp_input_path = None
    try:
        # delete=False because the file must outlive the handle: it is read by
        # path after being closed, and removed explicitly in the finally block.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
            temp_input_path = temp_input.name
            temp_input.write(file)
        print("Path of the input file: ", temp_input_path)

        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"An error occurred: {e}"
    finally:
        if temp_input_path is not None:
            try:
                os.unlink(temp_input_path)
            except OSError:
                pass
# Gradio Interface
def gradio_interface():
    """Build the Gradio interface that feeds an uploaded PDF to process_file.

    Returns:
        gr.Interface: Interface with a binary PDF file input and a 20-line
            textbox output; it is constructed here but not launched.
    """
    pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")
    result_box = gr.Textbox(label="Extracted JSON Data", lines=20)
    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=result_box,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all network
    # interfaces at port 7860 (the host/port used for Hugging Face).
    app = gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)
|