import io
import json
import os
import re

import pdfplumber
import pymupdf
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)


def create_hierarchical_structure_by_pymupdf(
    pdf_input: str | os.PathLike | bytes,
    *,
    x0_threshold: float = 1.5,
):
    """Create a hierarchical structure of text blocks from a PDF using PyMuPDF.

    Blocks are nested by horizontal indentation: a block becomes a child of
    the closest preceding block whose left edge (``x0``) lies more than
    ``x0_threshold`` to its left. Blocks with no such predecessor are placed
    at the top level. The nesting state is carried over from one page to the
    next.

    Args:
        pdf_input: Path to a PDF file, or the raw PDF content as ``bytes``.
        x0_threshold: Maximum ``x0`` difference for which a block is still
            treated as being at the same level as (or shallower than) the
            block on top of the stack.

    Returns:
        A dict ``{"blocks": [...]}``. Every entry has the keys
        ``page_number`` (1-based), ``coordinates`` (``x0``, ``y0``, ``x1``,
        ``y1``), ``text`` (stripped) and ``children`` (nested entries of the
        same shape). An input of any other type yields ``{"blocks": []}``.
    """
    if isinstance(pdf_input, (str, os.PathLike)):
        document = pymupdf.open(pdf_input)
    elif isinstance(pdf_input, bytes):
        document = pymupdf.open(stream=pdf_input, filetype="pdf")
    else:
        return {"blocks": []}

    structured_data = {"blocks": []}
    # Stack of currently "open" ancestors, ordered from outermost to innermost.
    hierarchy_stack = []

    # The context manager closes the document even if extraction fails.
    with document:
        for page_number, page in enumerate(document, start=1):
            for block in page.get_text("blocks"):
                # Only the bounding box and the text are needed; the trailing
                # tuple fields (block number, block type) are ignored.
                x0, y0, x1, y1, text, *_ = block

                stripped = text.strip()
                if not stripped:
                    continue  # Skip empty text blocks

                block_data = {
                    "page_number": page_number,
                    "coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
                    "text": stripped,
                    "children": [],
                }

                # Drop every stacked block that is not clearly to the left of
                # the current one: it cannot be the current block's parent.
                while (
                    hierarchy_stack
                    and (x0 - hierarchy_stack[-1]["coordinates"]["x0"])
                    <= x0_threshold
                ):
                    hierarchy_stack.pop()

                if hierarchy_stack:
                    # Attach to the innermost remaining ancestor.
                    hierarchy_stack[-1]["children"].append(block_data)
                else:
                    # No ancestor left: this is a top-level block.
                    structured_data["blocks"].append(block_data)

                hierarchy_stack.append(block_data)

    return structured_data


def extract_text_from_pdf(pdf_input: str | os.PathLike | bytes) -> str:
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_input: Path to a PDF file, or the raw PDF content as bytes.

    Returns:
        The text of all pages, each page followed by a newline. A page for
        which no text is extracted contributes only the newline.
    """
    # Raw content must be wrapped in a file-like object; a path is handed to
    # pdfplumber unchanged (wrapping a str in BytesIO raises TypeError).
    if isinstance(pdf_input, (bytes, bytearray, memoryview)):
        source = io.BytesIO(pdf_input)
    else:
        source = pdf_input

    with pdfplumber.open(source) as pdf:
        # `or ""` guards against extract_text() yielding None for a page.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )


def ask_openai_to_structure_text(text):
    """Use OpenAI API to structure the text into a hierarchical format.

    Args:
        text: Plain text to be organised into titles, subtitles and content.

    Returns:
        The model's reply as a string with Markdown code-fence markers
        removed and surrounding whitespace stripped. The reply is expected to
        be JSON but is not parsed or validated here. An empty string is
        returned when the reply carries no content.
    """
    prompt = f"""
    Structure the following text into a hierarchical structure to diferentiate titles or subtitles from content.
    The main goal is to associate a content to a title or subtitle.
    Keep the same hierarchy of the text.
    Dont summarize the text, just structure it.
    Include all the pages of the text in the structure.
    You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
    Within the content key, you can have a list of strings representing the content
    Ensure you return only a valid JSON.

    Text:
    {text}

    Example Output:
    {{
        "title": "Main Title",
        "sections": [
            {{
                "subtitle": "Subtitle 1",
                "content": [
                    "Content related to Subtitle 1.",
                    "More content related to Subtitle 1."
                ]
            }},
            {{
                "subtitle": "Subtitle 2",
                "content": [
                    "Content related to Subtitle 2.",
                    "More content related to Subtitle 2."
                ]
            }}
        ]
    }}
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extract text from Pdf documents",
            },
            {"role": "user", "content": prompt},
        ],
    )

    # The message content may be None; normalise it so re.sub gets a string.
    response_text = response.choices[0].message.content or ""

    # Remove Markdown code blocks (if present)
    return re.sub(r"```json|```", "", response_text).strip()


def create_hierarchical_structure_by_llm(pdf_input: str | os.PathLike | bytes):
    """Create a hierarchical structure for a PDF document from a path or bytes.

    The PDF text is extracted with ``extract_text_from_pdf``, sent to the
    model through ``ask_openai_to_structure_text``, and the reply is parsed
    as JSON.

    Args:
        pdf_input: Path to a PDF file, or the raw PDF content as ``bytes``.

    Returns:
        The parsed JSON reply, or ``None`` when the reply is not valid JSON
        (the parse error and the raw reply are printed in that case).

    Raises:
        ValueError: If ``pdf_input`` is neither a path nor ``bytes``.
    """
    # Step 1: Extract text from the PDF
    if not isinstance(pdf_input, (str, os.PathLike, bytes)):
        raise ValueError("pdf_input must be a file path or bytes.")
    text = extract_text_from_pdf(pdf_input)

    # Step 2: Ask OpenAI to structure the text
    structured_text = ask_openai_to_structure_text(text)

    # Step 3: Parse the structured text into a Python dictionary
    try:
        return json.loads(structured_text)
    except json.JSONDecodeError as e:
        print("Error parsing JSON response from OpenAI:", e)
        print("Raw response:", structured_text)
        return None