Spaces:

amougou-mbida
/

regulens

Running

regulens / scripts /pdf_text_extractor.py

Maximilian Amougou

Upload 8 files

bdc7d9a verified 5 months ago

5.18 kB

	import io
	import json
	import re
	import pdfplumber
	import pymupdf
	from dotenv import load_dotenv
	import os
	from openai import OpenAI

	# Read the .env file so its variables are available through the environment.
	load_dotenv()

	# Module-wide OpenAI client, authenticated with the OPENAI_API_KEY variable.
	api_key = os.environ.get("OPENAI_API_KEY")
	openai_client = OpenAI(api_key=api_key)


	def create_hierarchical_structure_by_pymupdf(pdf_input: str \| bytes):
	"""
	Create a hierarchical structure of text blocks from a PDF file using PyMuPDF.
	"""
	if isinstance(pdf_input, (str, os.PathLike)):
	document = pymupdf.open(pdf_input)
	elif isinstance(pdf_input, bytes):
	document = pymupdf.open(stream=pdf_input, filetype="pdf")
	else:
	return {"blocks": []}

	structured_data = {"blocks": []}

	# Stack to keep track of hierarchical levels based on x0
	hierarchy_stack = []

	# Threshold for considering blocks at the same level
	x0_threshold = 1.5

	for page_num in range(len(document)):
	page = document[page_num]
	blocks = page.get_text("blocks") # Extract text blocks

	for block in blocks:
	x0, y0, x1, y1, text, block_no, block_type = block

	# Skip empty text blocks
	if not text.strip():
	continue

	block_data = {
	"page_number": page_num,
	"coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
	"text": text.strip(),
	"children": [],
	}

	# Determine the correct hierarchical level for the current block
	while (
	hierarchy_stack
	and (x0 - hierarchy_stack[-1]["coordinates"]["x0"]) <= x0_threshold
	):
	hierarchy_stack.pop()

	if hierarchy_stack:
	# Add the current block as a child of the last block in the stack
	hierarchy_stack[-1]["children"].append(block_data)
	else:
	# If the stack is empty, add the block to the top level
	structured_data["blocks"].append(block_data)

	# Push the current block onto the stack
	hierarchy_stack.append(block_data)

	return structured_data


	def extract_text_from_pdf(pdf_input: str \| bytes):
	"""Extract text from a PDF file."""

	text = ""
	with pdfplumber.open(
	io.BytesIO(pdf_input)
	) as pdf:
	for page in pdf.pages:
	text += page.extract_text() + "\n"
	return text


	def ask_openai_to_structure_text(text):
	    """Use OpenAI API to structure the text into a hierarchical format.

	    Sends ``text`` to the ``gpt-4o-mini`` chat model together with
	    instructions and an example of the expected JSON layout (``title`` plus a
	    list of ``sections`` with ``subtitle`` and ``content``).

	    Args:
	        text: The plain text to be structured.

	    Returns:
	        The model's reply as a string, with a Markdown code fence that wraps
	        the whole reply removed and surrounding whitespace stripped. The
	        string is not parsed or validated here; it is empty when the reply
	        carries no content.
	    """

	    prompt = f"""
	Structure the following text into a hierarchical structure to diferentiate titles or subtitles from content.
	The main goal is to associate a content to a title or subtitle.
	Keep the same hierarchy of the text.
	Dont summarize the text, just structure it.
	Include all the pages of the text in the structure.
	You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
	Within the content key, you can have a list of strings representing the content
	Ensure you return only a valid JSON.

	Text:
	{text}

	Example Output:
	{{
	"title": "Main Title",
	"sections": [
	{{
	"subtitle": "Subtitle 1",
	"content": [
	"Content related to Subtitle 1.",
	"More content related to Subtitle 1."
	]
	}},
	{{
	"subtitle": "Subtitle 2",
	"content": [
	"Content related to Subtitle 2.",
	"More content related to Subtitle 2."
	]

	}}
	]
	}}
	"""

	    response = openai_client.chat.completions.create(
	        model="gpt-4o-mini",
	        messages=[
	            {
	                "role": "system",
	                "content": "You are a helpful assistant that extract text from Pdf documents",
	            },
	            {"role": "user", "content": prompt},
	        ],
	    )

	    # The message content can be missing; fall back to an empty string so the
	    # clean-up below never receives None.
	    response_text = (response.choices[0].message.content or "").strip()

	    # Remove a Markdown code fence wrapping the reply (if present). Only the
	    # opening and closing fence are stripped, so backticks inside the JSON
	    # content are left untouched.
	    response_text = re.sub(
	        r"\A```(?:json)?\s*|\s*```\Z", "", response_text, flags=re.IGNORECASE
	    )

	    return response_text.strip()


	def create_hierarchical_structure_by_llm(pdf_input: str \| bytes):
	"""Create a hierarchical structure for a PDF document from a path or bytes."""

	# Step 1: Extract text from the PDF
	if isinstance(pdf_input, (str, os.PathLike)) \| isinstance(pdf_input, bytes):
	text = extract_text_from_pdf(pdf_input)
	else:
	raise ValueError("pdf_input must be a file path or bytes.")

	# Step 2: Ask OpenAI to structure the text
	structured_text = ask_openai_to_structure_text(text)

	# Step 3: Parse the structured text into a Python dictionary
	try:
	hierarchical_structure = json.loads(structured_text)
	except json.JSONDecodeError as e:
	print("Error parsing JSON response from OpenAI:", e)
	print("Raw response:", structured_text)
	return None

	return hierarchical_structure