|
|
import io |
|
|
import json |
|
|
import re |
|
|
import pdfplumber |
|
|
import pymupdf |
|
|
from dotenv import load_dotenv |
|
|
import os |
|
|
from openai import OpenAI |
|
|
|
|
|
|
|
|
# Let python-dotenv populate the process environment first, so that the
# lookup below can see values that were supplied through a .env file.
load_dotenv()

# Module-wide OpenAI client. `api_key` is None when OPENAI_API_KEY is unset.
api_key = os.environ.get("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)
|
|
|
|
|
|
|
|
def create_hierarchical_structure_by_pymupdf(pdf_input: str | bytes): |
|
|
""" |
|
|
Create a hierarchical structure of text blocks from a PDF file using PyMuPDF. |
|
|
""" |
|
|
if isinstance(pdf_input, (str, os.PathLike)): |
|
|
document = pymupdf.open(pdf_input) |
|
|
elif isinstance(pdf_input, bytes): |
|
|
document = pymupdf.open(stream=pdf_input, filetype="pdf") |
|
|
else: |
|
|
return {"blocks": []} |
|
|
|
|
|
structured_data = {"blocks": []} |
|
|
|
|
|
|
|
|
hierarchy_stack = [] |
|
|
|
|
|
|
|
|
x0_threshold = 1.5 |
|
|
|
|
|
for page_num in range(len(document)): |
|
|
page = document[page_num] |
|
|
blocks = page.get_text("blocks") |
|
|
|
|
|
for block in blocks: |
|
|
x0, y0, x1, y1, text, block_no, block_type = block |
|
|
|
|
|
|
|
|
if not text.strip(): |
|
|
continue |
|
|
|
|
|
block_data = { |
|
|
"page_number": page_num + 1, |
|
|
"coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1}, |
|
|
"text": text.strip(), |
|
|
"children": [], |
|
|
} |
|
|
|
|
|
|
|
|
while ( |
|
|
hierarchy_stack |
|
|
and (x0 - hierarchy_stack[-1]["coordinates"]["x0"]) <= x0_threshold |
|
|
): |
|
|
hierarchy_stack.pop() |
|
|
|
|
|
if hierarchy_stack: |
|
|
|
|
|
hierarchy_stack[-1]["children"].append(block_data) |
|
|
else: |
|
|
|
|
|
structured_data["blocks"].append(block_data) |
|
|
|
|
|
|
|
|
hierarchy_stack.append(block_data) |
|
|
|
|
|
return structured_data |
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_input: str | bytes): |
|
|
"""Extract text from a PDF file.""" |
|
|
|
|
|
text = "" |
|
|
with pdfplumber.open( |
|
|
io.BytesIO(pdf_input) |
|
|
) as pdf: |
|
|
for page in pdf.pages: |
|
|
text += page.extract_text() + "\n" |
|
|
return text |
|
|
|
|
|
|
|
|
def ask_openai_to_structure_text(text):
    """Use OpenAI API to structure the text into a hierarchical format.

    Args:
        text: Plain text of the document; it is embedded verbatim in the
            prompt.

    Returns:
        The model's reply as a string with Markdown code-fence markers
        (```` ```json ```` / ```` ``` ````) removed and surrounding whitespace
        stripped. The reply is NOT parsed or validated here; an empty string
        is returned when the model sends back no text content.
    """
    prompt = f"""
    Structure the following text into a hierarchical structure to diferentiate titles or subtitles from content.
    The main goal is to associate a content to a title or subtitle.
    Keep the same hierarchy of the text.
    Dont summarize the text, just structure it.
    Include all the pages of the text in the structure.
    You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
    Within the content key, you can have a list of strings representing the content
    Ensure you return only a valid JSON.

    Text:
    {text}

    Example Output:
    {{
        "title": "Main Title",
        "sections": [
            {{
                "subtitle": "Subtitle 1",
                "content": [
                    "Content related to Subtitle 1.",
                    "More content related to Subtitle 1."
                ]
            }},
            {{
                "subtitle": "Subtitle 2",
                "content": [
                    "Content related to Subtitle 2.",
                    "More content related to Subtitle 2."
                ]

            }}
        ]
    }}
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extract text from Pdf documents",
            },
            {"role": "user", "content": prompt},
        ],
    )

    # `content` can be None; fall back to "" so the regex below does not raise
    # TypeError and the caller's JSON parsing reports the problem instead.
    response_text = response.choices[0].message.content or ""

    # Models often wrap JSON in Markdown code fences; drop the fence markers.
    response_text = re.sub(r"```json|```", "", response_text).strip()

    return response_text
|
|
|
|
|
|
|
|
def create_hierarchical_structure_by_llm(pdf_input: str | bytes): |
|
|
"""Create a hierarchical structure for a PDF document from a path or bytes.""" |
|
|
|
|
|
|
|
|
if isinstance(pdf_input, (str, os.PathLike)) | isinstance(pdf_input, bytes): |
|
|
text = extract_text_from_pdf(pdf_input) |
|
|
else: |
|
|
raise ValueError("pdf_input must be a file path or bytes.") |
|
|
|
|
|
|
|
|
structured_text = ask_openai_to_structure_text(text) |
|
|
|
|
|
|
|
|
try: |
|
|
hierarchical_structure = json.loads(structured_text) |
|
|
except json.JSONDecodeError as e: |
|
|
print("Error parsing JSON response from OpenAI:", e) |
|
|
print("Raw response:", structured_text) |
|
|
return None |
|
|
|
|
|
return hierarchical_structure |
|
|
|