Spaces:
Running
Running
| import io | |
| import json | |
| import re | |
| import pdfplumber | |
| import pymupdf | |
| from dotenv import load_dotenv | |
| import os | |
| from openai import OpenAI | |
# Load environment variables from .env file
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set in the environment.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level client, created once at import time and used by
# ask_openai_to_structure_text for its chat-completion request.
openai_client = OpenAI(api_key=api_key)
| def create_hierarchical_structure_by_pymupdf(pdf_input: str | bytes): | |
| """ | |
| Create a hierarchical structure of text blocks from a PDF file using PyMuPDF. | |
| """ | |
| if isinstance(pdf_input, (str, os.PathLike)): | |
| document = pymupdf.open(pdf_input) | |
| elif isinstance(pdf_input, bytes): | |
| document = pymupdf.open(stream=pdf_input, filetype="pdf") | |
| else: | |
| return {"blocks": []} | |
| structured_data = {"blocks": []} | |
| # Stack to keep track of hierarchical levels based on x0 | |
| hierarchy_stack = [] | |
| # Threshold for considering blocks at the same level | |
| x0_threshold = 1.5 | |
| for page_num in range(len(document)): | |
| page = document[page_num] | |
| blocks = page.get_text("blocks") # Extract text blocks | |
| for block in blocks: | |
| x0, y0, x1, y1, text, block_no, block_type = block | |
| # Skip empty text blocks | |
| if not text.strip(): | |
| continue | |
| block_data = { | |
| "page_number": page_num, | |
| "coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1}, | |
| "text": text.strip(), | |
| "children": [], | |
| } | |
| # Determine the correct hierarchical level for the current block | |
| while ( | |
| hierarchy_stack | |
| and (x0 - hierarchy_stack[-1]["coordinates"]["x0"]) <= x0_threshold | |
| ): | |
| hierarchy_stack.pop() | |
| if hierarchy_stack: | |
| # Add the current block as a child of the last block in the stack | |
| hierarchy_stack[-1]["children"].append(block_data) | |
| else: | |
| # If the stack is empty, add the block to the top level | |
| structured_data["blocks"].append(block_data) | |
| # Push the current block onto the stack | |
| hierarchy_stack.append(block_data) | |
| return structured_data | |
def extract_text_from_pdf(pdf_input: str | bytes):
    """Extract text from a PDF file.

    Args:
        pdf_input: Raw PDF content as ``bytes`` (wrapped in an in-memory
            stream) or a file path, which is handed to pdfplumber as is.

    Returns:
        The text of all pages concatenated, each page followed by a newline.
        Pages for which pdfplumber yields no text contribute an empty line.
    """
    # Only in-memory content needs a file-like wrapper; wrapping a path
    # string in BytesIO would raise TypeError.
    if isinstance(pdf_input, (bytes, bytearray)):
        source = io.BytesIO(pdf_input)
    else:
        source = pdf_input

    with pdfplumber.open(source) as pdf:
        # `or ""` guards against extract_text() returning None for a page.
        return "".join(f"{page.extract_text() or ''}\n" for page in pdf.pages)
def ask_openai_to_structure_text(text):
    """Use OpenAI API to structure the text into a hierarchical format.

    Args:
        text: Plain text of the document to be structured.

    Returns:
        The model's reply as a string with Markdown code-fence markers
        removed and surrounding whitespace stripped. The string is expected
        to be JSON but is not parsed or validated here. An empty string is
        returned when the reply carries no text content.
    """
    # NOTE: the prompt is runtime text sent to the model; its wording is kept
    # unchanged on purpose.
    prompt = f"""
Structure the following text into a hierarchical structure to diferentiate titles or subtitles from content.
The main goal is to associate a content to a title or subtitle.
Keep the same hierarchy of the text.
Dont summarize the text, just structure it.
Include all the pages of the text in the structure.
You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
Within the content key, you can have a list of strings representing the content
Ensure you return only a valid JSON.
Text:
{text}
Example Output:
{{
"title": "Main Title",
"sections": [
{{
"subtitle": "Subtitle 1",
"content": [
"Content related to Subtitle 1.",
"More content related to Subtitle 1."
]
}},
{{
"subtitle": "Subtitle 2",
"content": [
"Content related to Subtitle 2.",
"More content related to Subtitle 2."
]
}}
]
}}
"""
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extract text from Pdf documents",
            },
            {"role": "user", "content": prompt},
        ],
    )
    # Extract the content from the response; it may be None (e.g. a refusal),
    # which would make re.sub raise TypeError.
    response_text = response.choices[0].message.content or ""
    # Remove Markdown code blocks (if present)
    return re.sub(r"```json|```", "", response_text).strip()
| def create_hierarchical_structure_by_llm(pdf_input: str | bytes): | |
| """Create a hierarchical structure for a PDF document from a path or bytes.""" | |
| # Step 1: Extract text from the PDF | |
| if isinstance(pdf_input, (str, os.PathLike)) | isinstance(pdf_input, bytes): | |
| text = extract_text_from_pdf(pdf_input) | |
| else: | |
| raise ValueError("pdf_input must be a file path or bytes.") | |
| # Step 2: Ask OpenAI to structure the text | |
| structured_text = ask_openai_to_structure_text(text) | |
| # Step 3: Parse the structured text into a Python dictionary | |
| try: | |
| hierarchical_structure = json.loads(structured_text) | |
| except json.JSONDecodeError as e: | |
| print("Error parsing JSON response from OpenAI:", e) | |
| print("Raw response:", structured_text) | |
| return None | |
| return hierarchical_structure | |