regulens / scripts /pdf_text_extractor.py
amougou-fortiss's picture
Upload 9 files
ce77033 verified
import io
import json
import re
import pdfplumber
import pymupdf
from dotenv import load_dotenv
import os
from openai import OpenAI
# Load environment variables from .env file
# (must run before the os.getenv lookup below so values defined only in the
# .env file are visible to it).
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set.
# NOTE(review): how the OpenAI client behaves when given api_key=None is not
# visible from this file — confirm against the SDK before relying on it.
api_key = os.getenv("OPENAI_API_KEY")
# Shared module-level client; ask_openai_to_structure_text issues its chat
# completion request through it.
openai_client = OpenAI(api_key=api_key)
def create_hierarchical_structure_by_pymupdf(pdf_input: str | bytes):
"""
Create a hierarchical structure of text blocks from a PDF file using PyMuPDF.
"""
if isinstance(pdf_input, (str, os.PathLike)):
document = pymupdf.open(pdf_input)
elif isinstance(pdf_input, bytes):
document = pymupdf.open(stream=pdf_input, filetype="pdf")
else:
return {"blocks": []}
structured_data = {"blocks": []}
# Stack to keep track of hierarchical levels based on x0
hierarchy_stack = []
# Threshold for considering blocks at the same level
x0_threshold = 1.5
for page_num in range(len(document)):
page = document[page_num]
blocks = page.get_text("blocks") # Extract text blocks
for block in blocks:
x0, y0, x1, y1, text, block_no, block_type = block
# Skip empty text blocks
if not text.strip():
continue
block_data = {
"page_number": page_num + 1,
"coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
"text": text.strip(),
"children": [],
}
# Determine the correct hierarchical level for the current block
while (
hierarchy_stack
and (x0 - hierarchy_stack[-1]["coordinates"]["x0"]) <= x0_threshold
):
hierarchy_stack.pop()
if hierarchy_stack:
# Add the current block as a child of the last block in the stack
hierarchy_stack[-1]["children"].append(block_data)
else:
# If the stack is empty, add the block to the top level
structured_data["blocks"].append(block_data)
# Push the current block onto the stack
hierarchy_stack.append(block_data)
return structured_data
def extract_text_from_pdf(pdf_input: str | bytes):
"""Extract text from a PDF file."""
text = ""
with pdfplumber.open(
io.BytesIO(pdf_input)
) as pdf:
for page in pdf.pages:
text += page.extract_text() + "\n"
return text
def ask_openai_to_structure_text(text):
    """Use OpenAI API to structure the text into a hierarchical format.

    The text is embedded verbatim in a prompt that asks the model for a JSON
    document with the keys "title", "sections", "subtitle" and "content".

    Args:
        text: Plain text to be structured.

    Returns:
        The model's reply as a string with Markdown code-fence markers removed
        and surrounding whitespace stripped. The reply is requested to be JSON
        but is not parsed or validated here. An empty string is returned when
        the reply carries no text content.
    """
    prompt = f"""
Structure the following text into a hierarchical structure to diferentiate titles or subtitles from content.
The main goal is to associate a content to a title or subtitle.
Keep the same hierarchy of the text.
Dont summarize the text, just structure it.
Include all the pages of the text in the structure.
You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
Within the content key, you can have a list of strings representing the content
Ensure you return only a valid JSON.
Text:
{text}
Example Output:
{{
"title": "Main Title",
"sections": [
{{
"subtitle": "Subtitle 1",
"content": [
"Content related to Subtitle 1.",
"More content related to Subtitle 1."
]
}},
{{
"subtitle": "Subtitle 2",
"content": [
"Content related to Subtitle 2.",
"More content related to Subtitle 2."
]
}}
]
}}
"""
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extract text from Pdf documents",
            },
            {"role": "user", "content": prompt},
        ],
    )
    # The message content may be None (the field is optional in the API
    # response); fall back to "" so the regex below does not raise TypeError.
    response_text = response.choices[0].message.content or ""
    # Remove Markdown code blocks (if present)
    response_text = re.sub(r"```json|```", "", response_text).strip()
    return response_text
def create_hierarchical_structure_by_llm(pdf_input: str | bytes):
"""Create a hierarchical structure for a PDF document from a path or bytes."""
# Step 1: Extract text from the PDF
if isinstance(pdf_input, (str, os.PathLike)) | isinstance(pdf_input, bytes):
text = extract_text_from_pdf(pdf_input)
else:
raise ValueError("pdf_input must be a file path or bytes.")
# Step 2: Ask OpenAI to structure the text
structured_text = ask_openai_to_structure_text(text)
# Step 3: Parse the structured text into a Python dictionary
try:
hierarchical_structure = json.loads(structured_text)
except json.JSONDecodeError as e:
print("Error parsing JSON response from OpenAI:", e)
print("Raw response:", structured_text)
return None
return hierarchical_structure