regulens

Sleeping

App Files Files Community

regulens / scripts /pymupdf_no_nlp_preprocessing.py

amougou-fortiss

Upload 9 files

ce77033 verified 7 months ago

raw

history blame contribute delete

4.82 kB

	import json
	import os
	from dotenv import load_dotenv
	from openai import OpenAI
	from tqdm import tqdm
	from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
	from scripts.utility_functions import render_prompt
	from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text


	# Load environment variables from a .env file before they are read below.
	load_dotenv()

	# The spaCy German model load is left disabled in this script (no NLP step):
	# nlp = spacy.load("de_core_news_sm")
	# os.getenv returns None when OPENAI_API_KEY is not set in the environment.
	api_key = os.getenv("OPENAI_API_KEY")
	# Module-level client used by classify_changes_without_nlp_insights.
	openai_client = OpenAI(api_key=api_key)


	def create_prompt_without_nlp_insights(text):
	    """Render the user prompt for *text* with the NLP section switched off.

	    Delegates to ``render_prompt`` with ``include_nlp=False`` and returns
	    whatever that helper produces.
	    """
	    prompt = render_prompt(text, include_nlp=False)
	    return prompt


	def classify_changes_without_nlp_insights(text_content, location_info):
	    """Classify regulatory changes in one text chunk using OpenAI.

	    Args:
	        text_content: The text sent to the model (via
	            ``create_prompt_without_nlp_insights``).
	        location_info: Caller-supplied location data; stored unchanged under
	            the ``"location"`` key of a positive result.

	    Returns:
	        The parsed JSON object from the model, extended with ``"location"``
	        and ``"source_text"``, when it reports ``"changes_detected"`` as
	        truthy. ``None`` when the reply is empty, is not valid JSON, is not
	        a JSON object, or reports no changes.
	    """
	    response = openai_client.chat.completions.create(
	        model="gpt-4o-mini",
	        messages=[
	            {
	                "role": "system",
	                "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
	            },
	            {
	                "role": "user",
	                "content": create_prompt_without_nlp_insights(text_content),
	            },
	        ],
	        temperature=0.7,
	        max_tokens=1024,
	    )

	    # The message content may be None or empty; json.loads(None) would raise
	    # TypeError, which the JSONDecodeError handler below would not catch.
	    raw_content = response.choices[0].message.content
	    if not raw_content:
	        return None

	    payload = raw_content.strip()
	    if payload.startswith("```"):
	        # Models frequently wrap JSON in a Markdown code fence
	        # (```json ... ```); drop the fence lines so the payload parses.
	        first_newline = payload.find("\n")
	        payload = payload[first_newline + 1:] if first_newline != -1 else payload[3:]
	        payload = payload.rstrip()
	        if payload.endswith("```"):
	            payload = payload[:-3]

	    try:
	        result = json.loads(payload)
	    except json.JSONDecodeError:
	        return None

	    # Valid JSON that is not an object (list, string, number) has no .get().
	    if not isinstance(result, dict):
	        return None
	    if not result.get("changes_detected", False):
	        return None

	    result["location"] = location_info
	    result["source_text"] = text_content
	    return result


	def traverse_blocks(
	    blocks, parent=None, grandparent=None, results=None, is_top_level=True
	):
	    """Walk the block hierarchy depth-first and classify every leaf block.

	    Args:
	        blocks: Iterable of block dicts. Leaf blocks are read for
	            ``"page_number"`` and ``"text"``; non-leaf blocks for
	            ``"children"``.
	        parent: Block that contains ``blocks`` (``None`` at the top level).
	        grandparent: Accepted for call compatibility; not used in this
	            function.
	        results: List that collects positive classifications; a new list is
	            created when omitted.
	        is_top_level: When true, the iteration is wrapped in a tqdm progress
	            bar. Recursive calls pass ``False``.

	    Returns:
	        The ``results`` list, containing one classification dict per leaf
	        for which the classifier returned a truthy value.

	    Note:
	        Each visited block is mutated in place: its ``"parent"`` key is set
	        to the containing block.
	    """
	    if results is None:
	        results = []
	    iterable = (
	        tqdm(blocks, desc="Processing Text blocks without NLP")
	        if is_top_level
	        else blocks
	    )

	    for block in iterable:
	        # Record the containing block on the node for context tracking.
	        block["parent"] = parent

	        # A missing, None or empty "children" entry all mean "leaf". Using
	        # .get avoids the KeyError the recursive branch would otherwise hit
	        # for blocks that carry no "children" key at all.
	        children = block.get("children")
	        if not children:
	            text_content = extract_hierarchical_text(block)

	            location_info = {
	                "page_number": block["page_number"],
	                "block_text": block["text"],
	            }

	            # Analyze the text for changes.
	            changes = classify_changes_without_nlp_insights(
	                text_content, location_info
	            )
	            if changes:
	                # Keep the full hierarchical text alongside the result.
	                changes["text"] = text_content
	                results.append(changes)
	        else:
	            traverse_blocks(children, block, parent, results, is_top_level=False)

	    return results


	def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
	    """Detect regulatory changes in a hierarchical document structure.

	    Args:
	        hierarchical_structure: Mapping whose ``"blocks"`` entry holds the
	            top-level blocks passed to ``traverse_blocks``.

	    Returns:
	        For a falsy ``hierarchical_structure``: a single dict
	        ``{"error": "No hierarchical structure provided"}``.
	        Otherwise a tuple ``(final_output, results)`` where ``final_output``
	        has the keys ``"analysis_summary"`` and ``"changes_by_page"`` and
	        ``results`` is the raw list returned by ``traverse_blocks``.

	    Note:
	        The two return shapes differ (dict vs. tuple); callers must check
	        for the error dict before unpacking. This is kept as-is for
	        backward compatibility.
	    """
	    if not hierarchical_structure:
	        return {"error": "No hierarchical structure provided"}

	    changes_by_type = {"addition": 0, "deletion": 0, "modification": 0}
	    changes_by_page = {}
	    total_changes_detected = 0

	    # Traverse the blocks and analyze leaf nodes.
	    results = traverse_blocks(hierarchical_structure["blocks"])

	    for change in results:
	        # The classifications come from model output; tolerate a missing or
	        # null list instead of failing the whole summary.
	        classifications = change.get("classifications") or []
	        total_changes_detected += len(classifications)

	        for classification in classifications:
	            change_type = classification["change_type"]
	            # The model may emit a label outside the three pre-seeded ones;
	            # count it under its own key rather than raising KeyError.
	            changes_by_type[change_type] = changes_by_type.get(change_type, 0) + 1

	            # Group changes by page number.
	            page_number = change["location"]["page_number"]
	            change_subtype = (
	                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
	            )
	            changes_by_page.setdefault(page_number, []).append(
	                {
	                    "change": classification["change"],
	                    "change_type": classification["change_type"],
	                    "change_subtype": change_subtype,
	                    "relevant_text": classification["relevant_text"],
	                    "text": change["text"],
	                    "explanation": classification["explanation"],
	                }
	            )

	    # Combine analysis summary and grouped changes.
	    final_output = {
	        "analysis_summary": {
	            "total_changes_detected": total_changes_detected,
	            "changes_by_type": changes_by_type,
	        },
	        "changes_by_page": changes_by_page,
	    }

	    return final_output, results