| import subprocess |
| import json |
| import os |
| import gradio as gr |
| import tempfile |
|
|
| |
| |
|
|
def _find_caption_issues(regions):
    """Return one issue dict per region whose caption is on the wrong side."""
    issue_list = []
    for region in regions:
        caption_boundary = region["captionBoundary"]
        # The caption is treated as "Below" when its y1 is greater than the
        # y1 of the figure/table region, otherwise as "Above".
        if caption_boundary["y1"] > region["regionBoundary"]["y1"]:
            caption_location = "Below"
        else:
            caption_location = "Above"

        fig_type = region["figType"]
        if fig_type == "Table" and caption_location == "Below":
            description = "Location of the caption for tables must be above the table."
        elif fig_type == "Figure" and caption_location == "Above":
            description = "Location of the caption for figures must be below the figure."
        else:
            continue

        # Both kinds of issue carry the same set of keys.
        issue_list.append({
            "page_number": region["page"] + 1,
            "caption_coordinate": caption_boundary,
            "fig_type": fig_type,
            "caption_location": caption_location,
            "description": description,
        })
    return issue_list


def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Check caption placement in a PDF using pdffigures2.

    Runs pdffigures2 on the PDF, loads the regions it reports, and flags
    every table whose caption is below it and every figure whose caption is
    above it.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.

    Returns:
        list[dict]: One entry per misplaced caption with the keys
            "page_number" (the reported page index plus one),
            "caption_coordinate", "fig_type", "caption_location" and
            "description". Empty when nothing is flagged.

    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        Exception: If the command fails or the JSON output is not generated.
    """
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    print(input_pdf)

    pdf_stem = os.path.splitext(os.path.basename(input_pdf))[0]

    # Write the JSON into a throwaway directory so nothing is left behind in
    # the working directory and a stale file from an earlier run is never read.
    with tempfile.TemporaryDirectory() as work_dir:
        # "-d" takes a prefix: the tool writes "<prefix><pdf name>.json".
        data_prefix = os.path.join(work_dir, "_")
        command = [
            "java", "-jar", pdffigures2_jar,
            input_pdf,
            "-d", data_prefix,
        ]

        try:
            subprocess.run(command, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            raise Exception(f"Error while running pdffigures2: {e.stderr}") from e

        output_json_file = f"{data_prefix}{pdf_stem}.json"
        print("output_json_file: ", output_json_file)

        if not os.path.exists(output_json_file):
            raise Exception(f"JSON output file not found: {output_json_file}")

        with open(output_json_file, "r") as f:
            regions = json.load(f)

    return _find_caption_issues(regions)
|
|
def process_file(file):
    """
    Run grade_pdf on uploaded PDF content and return the result as text.

    Args:
        file (bytes): Raw content of the uploaded PDF; it is written to a
            temporary ".pdf" file that is removed again before returning.

    Returns:
        str: JSON string of the data returned by grade_pdf, or a message
            starting with "An error occurred:" if anything fails.
    """
    pdffigures2_jar_path = "pdffigures2.jar"

    if file is None:
        return "An error occurred: no PDF file was uploaded."

    temp_input_path = None
    try:
        # Close the temp file before handing its path to the external tool so
        # the content is fully flushed to disk and the handle is not leaked.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_input:
            temp_input_path = temp_input.name
            temp_input.write(file)
        print("Path of the input file: ", temp_input_path)

        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"An error occurred: {e}"
    finally:
        # Remove the temp file on success and on failure alike.
        if temp_input_path is not None and os.path.exists(temp_input_path):
            os.unlink(temp_input_path)
|
|
| |
def gradio_interface():
    """
    Build the Gradio interface that passes an uploaded PDF to process_file.

    Returns:
        gr.Interface: The configured interface; launching it is left to the
            caller.
    """
    # PDF upload, delivered to the callback as binary content.
    pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")
    # Text area showing whatever string process_file returns.
    result_box = gr.Textbox(label="Extracted JSON Data", lines=20)

    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=result_box,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
|
|
if __name__ == "__main__":
    # Executed as a script: build the interface, then start it.
    app = gradio_interface()
    app.launch()