File size: 4,789 Bytes
a033fd2
 
 
 
9f68e72
a033fd2
38eb864
 
 
e31f53e
38eb864
 
 
 
 
 
 
 
 
 
 
 
 
e31f53e
 
a033fd2
 
 
38eb864
 
e31f53e
 
 
38eb864
e31f53e
38eb864
 
e31f53e
38eb864
e31f53e
a033fd2
38eb864
 
 
 
 
 
a033fd2
 
fda9ac3
 
38eb864
 
fda9ac3
38eb864
 
 
 
 
3d04d70
38eb864
a932dee
38eb864
 
 
 
afbc49c
38eb864
 
a932dee
38eb864
 
cc8bf84
38eb864
 
3d04d70
38eb864
a033fd2
e31f53e
38eb864
a033fd2
afbc49c
a033fd2
38eb864
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a033fd2
da3917e
38eb864
 
9f68e72
38eb864
da3917e
38eb864
 
 
a033fd2
 
da3917e
afbc49c
38eb864
a033fd2
 
 
38eb864
 
 
 
 
da3917e
38eb864
da3917e
a033fd2
 
 
 
38eb864
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import subprocess
import json
import os
import gradio as gr
import tempfile

# Set JAVA_HOME environment variable
# os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'

def grade_pdf(input_pdf, pdffigures2_jar):
    """
    Run pdffigures2 on a PDF and report captions placed on the wrong side.

    The JAR is invoked as ``java -jar <jar> <pdf> -d _``; the function then
    reads ``_<pdf stem>.json`` from the current working directory, checks every
    extracted region, and deletes that intermediate JSON file again.

    Rules applied:
        * a "Table" whose caption is below the region is reported;
        * a "Figure" whose caption is above the region is reported.

    Args:
        input_pdf (str): Path to the input PDF file.
        pdffigures2_jar (str): Path to the pdffigures2 JAR file.
    Returns:
        list[dict]: One entry per violation with the keys ``page_number``,
        ``caption_coordinate``, ``fig_type``, ``caption_location`` and
        ``description``. Empty when no violation is found.
    Raises:
        FileNotFoundError: If the input PDF or JAR file does not exist.
        Exception: If the command fails or the JSON output is not generated.
    """
    # Check if input file and JAR file exist
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f"Input PDF file not found: {input_pdf}")
    if not os.path.exists(pdffigures2_jar):
        raise FileNotFoundError(f"pdffigures2 JAR file not found: {pdffigures2_jar}")

    # Command to execute pdffigures2; "_" is the prefix the output JSON name
    # is built from below.
    command = [
        "java", "-jar", pdffigures2_jar,
        input_pdf,
        "-d", "_",
    ]
    print(input_pdf)

    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Keep the original exception as the cause so the traceback is not lost.
        raise Exception(f"Error while running pdffigures2: {e.stderr}") from e

    # Construct the output JSON file path (relative to the working directory)
    pdf_stem = os.path.splitext(os.path.basename(input_pdf))[0]
    output_json_file = "_" + pdf_stem + ".json"
    print("output_json_file: ", output_json_file)

    if not os.path.exists(output_json_file):
        raise Exception(f"JSON output file not found: {output_json_file}")

    try:
        with open(output_json_file, "r", encoding="utf-8") as f:
            regions = json.load(f)
    finally:
        # The JSON is only an intermediate artifact; remove it so repeated
        # calls do not pile up files in the working directory (best effort).
        try:
            os.remove(output_json_file)
        except OSError:
            pass

    # figType -> (caption location that is a violation, message to report)
    rules = {
        "Table": (
            "Below",
            "Location of the caption for tables must be above the table.",
        ),
        "Figure": (
            "Above",
            "Location of the caption for figures must be below the figure.",
        ),
    }

    issue_list = []
    for region in regions:
        # NOTE(review): a larger y1 is treated as "further down the page";
        # presumably the coordinates have a top-left origin - confirm.
        caption_y = region["captionBoundary"]["y1"]
        region_y = region["regionBoundary"]["y1"]
        caption_location = "Below" if caption_y > region_y else "Above"

        rule = rules.get(region["figType"])
        if rule is None or rule[0] != caption_location:
            continue

        issue_list.append({
            # presumably "page" is 0-based in the tool output; reported 1-based
            "page_number": region["page"] + 1,
            "caption_coordinate": region["captionBoundary"],
            "fig_type": region["figType"],
            "caption_location": caption_location,
            "description": rule[1],
        })

    return issue_list

def process_file(file):
    """
    Write the uploaded PDF bytes to a temporary file, grade it, return JSON text.

    Args:
        file (bytes): Raw content of the uploaded PDF (the interface in this
            file is configured with ``type="binary"``). ``None`` means nothing
            was uploaded.
    Returns:
        str: The issue list from ``grade_pdf`` as an indented JSON string, or
        a message starting with "An error occurred:" if anything fails.
    """
    # Define paths
    pdffigures2_jar_path = "pdffigures2.jar"  # Path to pdffigures2 JAR file

    if file is None:
        return "An error occurred: no PDF file was uploaded."

    temp_input_path = None
    try:
        # Close the temp file before grading: the external process reads it by
        # path, so every byte has to be flushed to disk first.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
            temp_input_path = temp_input.name
            temp_input.write(file)
        print("Path of the input file: ", temp_input_path)

        # Process the PDF and get JSON data
        extracted_data = grade_pdf(temp_input_path, pdffigures2_jar_path)

        # Return JSON as a formatted string
        return json.dumps(extracted_data, indent=4)
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"An error occurred: {e}"
    finally:
        # Remove the temporary input on success and on failure (best effort).
        if temp_input_path is not None:
            try:
                os.unlink(temp_input_path)
            except OSError:
                pass

# Gradio Interface
def gradio_interface():
    """
    Build (but do not launch) the Gradio interface around ``process_file``.

    Returns:
        gr.Interface: Interface with one PDF upload input and one text output.
    """
    # Input: a single upload restricted to .pdf, configured with type="binary".
    pdf_upload = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")

    # Output: multi-line text box showing whatever string process_file returns.
    result_box = gr.Textbox(label="Extracted JSON Data", lines=20)

    return gr.Interface(
        fn=process_file,
        inputs=pdf_upload,
        outputs=result_box,
        title="PDF Grading Interface",
        description="Upload a PDF file, and this tool will extract figures and captions as JSON.",
    )

if __name__ == "__main__":
    # Launch the Gradio app on the correct host/port for Hugging Face:
    # listen on all interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    gradio_interface().launch(**launch_options)