import json
import os
import subprocess
import tempfile

import gradio as gr

# Set JAVA_HOME environment variable
# os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'


def _find_caption_issues(regions):
    """Return the caption-placement issues found in pdffigures2 regions.

    A region's caption is classified as "Below" when the caption boundary's
    ``y1`` is greater than the region boundary's ``y1``, otherwise "Above".
    Tables captioned below and figures captioned above are reported.

    Args:
        regions (list[dict]): Region records, each providing
            ``captionBoundary``, ``regionBoundary``, ``figType`` and ``page``.

    Returns:
        list[dict]: One record per offending region with the keys
        ``page_number``, ``caption_coordinate``, ``fig_type``,
        ``caption_location`` and ``description``.
    """
    issue_list = []
    for region in regions:
        caption_boundary = region["captionBoundary"]
        if caption_boundary["y1"] > region["regionBoundary"]["y1"]:
            caption_location = "Below"
        else:
            caption_location = "Above"
        fig_type = region["figType"]

        if fig_type == "Table" and caption_location == "Below":
            description = (
                "Location of the caption for tables must be above the table."
            )
        elif fig_type == "Figure" and caption_location == "Above":
            description = (
                "Location of the caption for figures must be below the figure."
            )
        else:
            continue

        issue_list.append(
            {
                # The region's page index is shifted by one for reporting.
                "page_number": region["page"] + 1,
                "caption_coordinate": caption_boundary,
                "fig_type": fig_type,
                "caption_location": caption_location,
                "description": description,
            }
        )
    return issue_list


def grade_pdf(input_pdf, pdffigures2_jar):
    """Run pdffigures2 on a PDF and report caption-placement issues.

    The tool is invoked with ``-d _`` and its result is expected in the
    current working directory as ``_<pdf base name>.json``. That file is
    removed once it has been read.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.

    Returns:
        list[dict]: Issue records for tables captioned below the table and
        figures captioned above the figure (empty when none are found).

    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        Exception: If the command fails or the JSON output is not generated.
    """
    # Check if input file and JAR file exist
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(
            f"pdffigures2 JAR file not found: {pdffigures2_jar}"
        )

    # Command to execute pdffigures2
    command = ["java", "-jar", pdffigures2_jar, input_pdf, "-d", "_"]
    print(input_pdf)

    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Chain the cause so the original return code stays in the traceback.
        raise Exception(f"Error while running pdffigures2: {e.stderr}") from e

    # Construct the output JSON file path
    pdf_stem = os.path.splitext(os.path.basename(input_pdf))[0]
    output_json_file = "_" + pdf_stem + ".json"
    print("output_json_file: ", output_json_file)

    if not os.path.exists(output_json_file):
        raise Exception(f"JSON output file not found: {output_json_file}")

    try:
        with open(output_json_file, "r", encoding="utf-8") as f:
            regions = json.load(f)
    finally:
        # The JSON is a one-shot intermediate; best-effort removal keeps the
        # working directory from filling up with one file per request.
        try:
            os.remove(output_json_file)
        except OSError:
            pass

    return _find_caption_issues(regions)


def process_file(file):
    """Grade an uploaded PDF and return the result as text.

    The uploaded bytes are written to a temporary ``.pdf`` file, passed to
    ``grade_pdf``, and the temporary file is removed afterwards whether or
    not grading succeeded.

    Args:
        file (bytes): Raw content of the uploaded PDF (the Gradio input is
            declared with ``type="binary"``).

    Returns:
        str: The issue list as an indented JSON string, or a message of the
        form ``"An error occurred: ..."`` if anything fails.
    """
    # Define paths
    pdffigures2_jar_path = "pdffigures2.jar"  # Path to pdffigures2 JAR file
    temp_input_path = None

    try:
        # Leaving the ``with`` block flushes and closes the file, so the
        # external process sees the complete PDF on disk.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf"
        ) as temp_input:
            # Record the path before writing so a failed write is cleaned up.
            temp_input_path = temp_input.name
            temp_input.write(file)
        print("Path of the input file: ", temp_input_path)

        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)

        # Return JSON as a formatted string
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"An error occurred: {e}"
    finally:
        # Clean up the temporary input file on success and on failure alike.
        if temp_input_path is not None and os.path.exists(temp_input_path):
            os.unlink(temp_input_path)


# Gradio Interface
def gradio_interface():
    """Build the Gradio interface that feeds an uploaded PDF to process_file.

    Returns:
        gr.Interface: Interface with a binary PDF upload input and a
        20-line textbox output.
    """
    interface = gr.Interface(
        fn=process_file,
        inputs=gr.File(
            label="Upload PDF File", file_types=[".pdf"], type="binary"
        ),
        outputs=gr.Textbox(label="Extracted JSON Data", lines=20),
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
    return interface


if __name__ == "__main__":
    # Launch the Gradio app on the correct host/port for Hugging Face
    gradio_interface().launch(server_name="0.0.0.0", server_port=7860)