File size: 5,183 Bytes
ce77033
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import io
import json
import re
import pdfplumber
import pymupdf
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load environment variables from .env file.
# This must run before the os.getenv() call below so that a key defined only
# in the .env file is visible to it.
load_dotenv()

# os.getenv returns None when OPENAI_API_KEY is not set.
# NOTE(review): how the OpenAI client reacts to api_key=None is not visible
# here -- confirm against the SDK before relying on it.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level client used by ask_openai_to_structure_text().
openai_client = OpenAI(api_key=api_key)


def create_hierarchical_structure_by_pymupdf(pdf_input: str | bytes):
    """
    Create a hierarchical structure of text blocks from a PDF file using PyMuPDF.
    """
    if isinstance(pdf_input, (str, os.PathLike)):
        document = pymupdf.open(pdf_input)
    elif isinstance(pdf_input, bytes):
        document = pymupdf.open(stream=pdf_input, filetype="pdf")
    else:
        return {"blocks": []}

    structured_data = {"blocks": []}

    # Stack to keep track of hierarchical levels based on x0
    hierarchy_stack = []

    # Threshold for considering blocks at the same level
    x0_threshold = 1.5

    for page_num in range(len(document)):
        page = document[page_num]
        blocks = page.get_text("blocks")  # Extract text blocks

        for block in blocks:
            x0, y0, x1, y1, text, block_no, block_type = block

            # Skip empty text blocks
            if not text.strip():
                continue

            block_data = {
                "page_number": page_num + 1,
                "coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
                "text": text.strip(),
                "children": [],
            }

            # Determine the correct hierarchical level for the current block
            while (
                hierarchy_stack
                and (x0 - hierarchy_stack[-1]["coordinates"]["x0"]) <= x0_threshold
            ):
                hierarchy_stack.pop()

            if hierarchy_stack:
                # Add the current block as a child of the last block in the stack
                hierarchy_stack[-1]["children"].append(block_data)
            else:
                # If the stack is empty, add the block to the top level
                structured_data["blocks"].append(block_data)

            # Push the current block onto the stack
            hierarchy_stack.append(block_data)

    return structured_data


def extract_text_from_pdf(pdf_input: str | bytes):
    """Extract text from a PDF file."""

    text = ""
    with pdfplumber.open(
        io.BytesIO(pdf_input)
    ) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text


def ask_openai_to_structure_text(text):
    """Use OpenAI API to structure the text into a hierarchical format.

    Sends ``text`` to the ``gpt-4o-mini`` chat model through the module-level
    ``openai_client`` with a prompt asking for a title/sections JSON layout.

    Args:
        text: The plain text to be structured.

    Returns:
        The model's reply as a string with Markdown code-fence markers removed
        and surrounding whitespace stripped. The reply is NOT validated as
        JSON here. An empty string is returned when the model sends no text
        content.
    """

    prompt = f"""
    Structure the following text into a hierarchical structure to diferentiate titles or subtitles from content.
    The main goal is to associate a content to a title or subtitle.
    Keep the same hierarchy of the text.
    Dont summarize the text, just structure it.
    Include all the pages of the text in the structure.
    You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
    Within the content key, you can have a list of strings representing the content
    Ensure you return only a valid JSON.

    Text:
    {text}

    Example Output:
    {{
        "title": "Main Title",
        "sections": [
            {{
                "subtitle": "Subtitle 1",
                "content": [
                    "Content related to Subtitle 1.",
                    "More content related to Subtitle 1."
                ]
            }},
            {{
                "subtitle": "Subtitle 2",
                "content": [
                    "Content related to Subtitle 2.",
                    "More content related to Subtitle 2."
                ] 
                
            }}
        ]
    }}
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extract text from Pdf documents",
            },
            {"role": "user", "content": prompt},
        ],
    )

    # The message content can be None; fall back to an empty string so the
    # regex below does not raise TypeError and callers see an unparsable
    # (empty) reply instead.
    response_text = response.choices[0].message.content or ""

    # Remove Markdown code blocks (if present)
    response_text = re.sub(r"```json|```", "", response_text).strip()

    return response_text


def create_hierarchical_structure_by_llm(pdf_input: str | bytes):
    """Create a hierarchical structure for a PDF document from a path or bytes."""

    # Step 1: Extract text from the PDF
    if isinstance(pdf_input, (str, os.PathLike)) | isinstance(pdf_input, bytes):
        text = extract_text_from_pdf(pdf_input)
    else:
        raise ValueError("pdf_input must be a file path or bytes.")

    # Step 2: Ask OpenAI to structure the text
    structured_text = ask_openai_to_structure_text(text)

    # Step 3: Parse the structured text into a Python dictionary
    try:
        hierarchical_structure = json.loads(structured_text)
    except json.JSONDecodeError as e:
        print("Error parsing JSON response from OpenAI:", e)
        print("Raw response:", structured_text)
        return None

    return hierarchical_structure