# qtpi's picture
# Update app.py
# cc8bf84 verified
import subprocess
import json
import os
import gradio as gr
import tempfile
# Set JAVA_HOME environment variable
# os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Run pdffigures2 on a PDF and report captions placed on the wrong side.

    pdffigures2 is invoked with the data prefix ``_``; its JSON output is
    expected at ``_<pdf basename>.json`` in the current working directory.
    That file is parsed and then removed, so repeated calls do not pile up
    stale JSON files.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.

    Returns:
        list[dict]: One entry per offending region, with the keys
        ``page_number``, ``caption_coordinate``, ``fig_type``,
        ``caption_location`` and ``description``.

    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        RuntimeError: If the pdffigures2 command fails or the JSON output
            file is not generated.
    """
    # Fail early with a clear message instead of an opaque Java error.
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    command = ["java", "-jar", pdffigures2_jar, input_pdf, "-d", "_"]
    print(input_pdf)
    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Keep the original error chained so the Java failure stays visible.
        raise RuntimeError(f"Error while running pdffigures2: {e.stderr}") from e

    # Output name = data prefix "_" + PDF basename without extension + ".json".
    output_json_file = "_" + os.path.splitext(os.path.basename(input_pdf))[0] + ".json"
    print("output_json_file: ", output_json_file)
    if not os.path.exists(output_json_file):
        raise RuntimeError(f"JSON output file not found: {output_json_file}")

    try:
        with open(output_json_file, "r") as f:
            regions = json.load(f)
    finally:
        # The JSON file is only an intermediate artefact; always clean it up.
        os.remove(output_json_file)

    return _caption_issues(regions)


def _caption_issues(regions):
    """Return the caption-placement issues for a list of pdffigures2 regions."""
    # (figType, caption location) combinations that violate the placement rule.
    descriptions = {
        ("Table", "Below"): "Location of the caption for tables must be above the table.",
        ("Figure", "Above"): "Location of the caption for figures must be below the figure.",
    }
    issue_list = []
    for region in regions:
        # A caption whose y1 is greater than the region's y1 is treated as below it.
        caption_below = region["captionBoundary"]["y1"] > region["regionBoundary"]["y1"]
        caption_location = "Below" if caption_below else "Above"
        description = descriptions.get((region["figType"], caption_location))
        if description is None:
            continue
        issue_list.append(
            {
                # Reported page number is the region's "page" value plus one.
                "page_number": region["page"] + 1,
                "caption_coordinate": region["captionBoundary"],
                "fig_type": region["figType"],
                "caption_location": caption_location,
                "description": description,
            }
        )
    return issue_list
def process_file(file):
    """
    Save uploaded PDF bytes to a temporary file and grade it with grade_pdf.

    Args:
        file (bytes): Raw content of the uploaded PDF file.

    Returns:
        str: The issues found, as an indented JSON string, or a message
        starting with "An error occurred: " if anything fails.
    """
    if file is None:
        # Nothing was uploaded; report it the same way as other failures.
        return "An error occurred: No file was uploaded."

    pdffigures2_jar_path = "pdffigures2.jar"  # Path to pdffigures2 JAR file
    temp_input_path = None
    try:
        # Close the file before grading so every byte is flushed to disk
        # before the external Java process opens it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
            temp_input_path = temp_input.name
            temp_input.write(file)
        print("Path of the input file: ", temp_input_path)

        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing.
        return f"An error occurred: {e}"
    finally:
        # Remove the temporary PDF on both the success and the error path.
        if temp_input_path is not None and os.path.exists(temp_input_path):
            os.unlink(temp_input_path)
# Gradio Interface
def gradio_interface():
    """
    Build the Gradio interface that feeds an uploaded PDF to process_file.

    Returns:
        gr.Interface: The configured interface; it is not launched here.
    """
    # Input: a single PDF upload, handed to the callback as raw bytes.
    pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")
    # Output: a multi-line text box showing the string process_file returns.
    result_box = gr.Textbox(label="Extracted JSON Data", lines=20)

    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=result_box,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )
if __name__ == "__main__":
    # Build the app, then serve it on all interfaces at port 7860
    # (the host/port used for Hugging Face).
    app = gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)