|
|
import subprocess |
|
|
import json |
|
|
import os |
|
|
import gradio as gr |
|
|
import tempfile |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Run pdffigures2 on a PDF and report captions placed on the wrong side.

    pdffigures2 is started with the figure-data prefix ``_``; this function
    then expects to find ``_<pdf stem>.json`` in the current working
    directory and checks every region listed in it.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.

    Returns:
        list[dict]: One record per misplaced caption, with the keys
            ``page_number`` (``region["page"] + 1``), ``caption_coordinate``,
            ``fig_type``, ``caption_location`` and ``description``. The list
            is empty when no caption is flagged.

    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        RuntimeError: If the command fails or the JSON output is not
            generated. ``RuntimeError`` is a subclass of ``Exception``, so
            callers that catch ``Exception`` keep working.
    """
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    # Argument list (no shell) so the paths are passed through verbatim.
    command = [
        "java", "-jar", pdffigures2_jar,
        input_pdf,
        "-d", "_",
    ]
    print(input_pdf)

    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Keep the original error attached as __cause__ for debugging.
        raise RuntimeError(f"Error while running pdffigures2: {e.stderr}") from e

    # Prefix "_" + input file name without extension + ".json", relative to
    # the current working directory.
    output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
    print("output_json_file: ", output_json_file)

    if not os.path.exists(output_json_file):
        raise RuntimeError(f"JSON output file not found: {output_json_file}")

    # NOTE(review): the JSON file stays in the working directory after it is
    # read; nothing here removes it. Confirm whether callers rely on that
    # before adding cleanup.
    with open(output_json_file, "r") as f:
        regions = json.load(f)

    return _find_caption_issues(regions)


def _find_caption_issues(regions):
    """Return an issue record for each table/figure whose caption is misplaced."""
    # figType -> (caption side that is reported, message for the report).
    rules = {
        "Table": (
            "Below",
            "Location of the caption for tables must be above the table.",
        ),
        "Figure": (
            "Above",
            "Location of the caption for figures must be below the figure.",
        ),
    }

    issue_list = []
    for region in regions:
        # The caption counts as "Below" when its y1 is strictly greater than
        # the region's y1 (presumably y grows down the page -- confirm against
        # the pdffigures2 output format).
        caption_is_below = (
            region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]
        )
        caption_location = "Below" if caption_is_below else "Above"

        rule = rules.get(region["figType"])
        if rule is None:
            continue
        flagged_location, description = rule
        if caption_location != flagged_location:
            continue

        # Tables and figures share one record layout, so figure issues carry
        # "caption_location" too.
        issue_list.append({
            "page_number": region["page"] + 1,
            "caption_coordinate": region["captionBoundary"],
            "fig_type": region["figType"],
            "caption_location": caption_location,
            "description": description,
        })
    return issue_list
|
|
|
|
|
def process_file(file):
    """
    Grade an uploaded PDF via grade_pdf and return the result as text.

    The upload is written to a temporary ``.pdf`` file because grade_pdf
    works on a file path. The temporary file is removed again whether or not
    grading succeeds.

    Args:
        file (bytes): Raw content of the uploaded PDF file.

    Returns:
        str: JSON string (indent of 4) of the data returned by grade_pdf, or
            a message starting with "An error occurred:" if anything fails.
    """
    pdffigures2_jar_path = "pdffigures2.jar"

    if file is None:
        return "An error occurred: no file was uploaded"

    temp_input_path = None
    try:
        # delete=False keeps the file on disk after the handle is closed.
        # Leaving the `with` block closes it, which flushes the bytes before
        # the external tool opens the path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
            temp_input_path = temp_input.name
            temp_input.write(file)
        print("Path of the input file: ", temp_input_path)

        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"An error occurred: {e}"
    finally:
        # Remove the temporary copy on failure as well as on success.
        if temp_input_path is not None:
            try:
                os.unlink(temp_input_path)
            except OSError:
                pass
|
|
|
|
|
|
|
|
def gradio_interface():
    """
    Assemble the Gradio front end for the PDF grader.

    Returns:
        gr.Interface: Interface with a PDF upload as input, process_file as
            the callback, and a 20-line text box as output.
    """
    # type="binary" is what lets process_file write the upload with a plain
    # file write (see the Gradio File docs for the exact value passed).
    pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")
    result_box = gr.Textbox(label="Extracted JSON Data", lines=20)

    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=result_box,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
|
|
|
|
|
if __name__ == "__main__":
    # Start the web UI only when this file is run as a script, not on import.
    app = gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
|
|