|
|
import json |
|
|
import os |
|
|
from dotenv import load_dotenv |
|
|
from openai import OpenAI |
|
|
from tqdm import tqdm |
|
|
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES |
|
|
from scripts.utility_functions import render_prompt |
|
|
from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text |
|
|
|
|
|
|
|
|
|
|
|
# Load variables from a .env file (if one is found) into the process
# environment before the API key is read below.
load_dotenv()

# Module-level OpenAI client shared by the functions in this file. The key is
# read from the OPENAI_API_KEY environment variable.
api_key = os.environ.get("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)
|
|
|
|
|
|
|
|
def create_prompt_without_nlp_insights(text):
    """Return the prompt for *text* rendered with ``include_nlp=False``.

    Thin wrapper around ``render_prompt``; the result is whatever that helper
    returns for the given text.
    """
    prompt = render_prompt(text, include_nlp=False)
    return prompt
|
|
|
|
|
|
|
|
def classify_changes_without_nlp_insights(text_content, location_info):
    """Classify regulatory changes in one text chunk using OpenAI.

    Sends a system message plus the prompt built by
    ``create_prompt_without_nlp_insights(text_content)`` to the ``gpt-4o-mini``
    chat model and parses the reply as JSON.

    Args:
        text_content: Text of the chunk to analyze.
        location_info: Value stored under ``"location"`` on a positive result.

    Returns:
        The parsed reply (a dict) with ``"location"`` and ``"source_text"``
        added when its ``"changes_detected"`` entry is truthy; ``None`` when
        no changes are reported or when the reply is empty, not valid JSON,
        or not a JSON object.

    Exceptions raised by the API call itself are not caught here and
    propagate to the caller.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
            },
            {
                "role": "user",
                "content": create_prompt_without_nlp_insights(text_content),
            },
        ],
        temperature=0.7,
        max_tokens=1024,
    )

    result = _parse_json_object(response.choices[0].message.content)
    if result is None or not result.get("changes_detected", False):
        return None

    # Attach provenance so downstream aggregation can group by location.
    result["location"] = location_info
    result["source_text"] = text_content
    return result


def _parse_json_object(raw_content):
    """Parse a model reply into a dict; return None if that is not possible.

    Handles three cases a plain ``json.loads`` call does not: an empty or
    ``None`` reply, a reply wrapped in a Markdown code fence, and valid JSON
    whose top-level value is not an object.
    """
    if not raw_content:
        return None

    payload = raw_content.strip()
    if payload.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        _, _, payload = payload.partition("\n")
        payload = payload.rsplit("```", 1)[0].strip()

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        return None

    # Callers use dict methods on the result, so reject lists/scalars.
    return parsed if isinstance(parsed, dict) else None
|
|
|
|
|
|
|
|
def traverse_blocks(
    blocks, parent=None, grandparent=None, results=None, is_top_level=True
):
    """Walk the block hierarchy depth-first and classify every leaf block.

    Each visited block gets a ``"parent"`` entry pointing at its parent block
    (``None`` at the top level), so the input structure is mutated. A block is
    a leaf when it has no ``"children"`` key or when that entry is empty or
    ``None``; leaves are passed to ``classify_changes_without_nlp_insights``
    and positive results are collected.

    Args:
        blocks: Iterable of block dicts. Leaf blocks must provide
            ``"page_number"`` and ``"text"``.
        parent: Block that owns ``blocks``; ``None`` for the top level.
        grandparent: Accepted for the recursive call signature; not read.
        results: List to append to. A new list is created when ``None``.
        is_top_level: When true, iteration is wrapped in a ``tqdm`` progress
            bar. Recursive calls pass ``False``.

    Returns:
        The ``results`` list, holding one classification dict per leaf for
        which changes were reported, each extended with a ``"text"`` entry.
    """
    if results is None:
        results = []

    # Only the outermost call shows a progress bar; nested calls iterate plainly.
    # NOTE(review): the label says "with NLP" although this module is the
    # without-NLP variant -- left unchanged, confirm whether that is intended.
    iterable = (
        tqdm(blocks, desc="Processing Text blocks with NLP") if is_top_level else blocks
    )

    for block in iterable:
        # Back-reference to the parent block; presumably consumed by
        # extract_hierarchical_text -- verify against that helper.
        block["parent"] = parent

        # .get() so that a block without a "children" key is treated as a leaf
        # instead of raising KeyError in the recursive branch.
        children = block.get("children")
        if children:
            traverse_blocks(children, block, parent, results, is_top_level=False)
            continue

        text_content = extract_hierarchical_text(block)
        location_info = {
            "page_number": block["page_number"],
            "block_text": block["text"],
        }

        changes = classify_changes_without_nlp_insights(text_content, location_info)
        if changes:
            changes["text"] = text_content
            results.append(changes)

    return results
|
|
|
|
|
|
|
|
def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
    """Detect regulatory changes in a hierarchical structure and summarize them.

    Runs ``traverse_blocks`` over ``hierarchical_structure["blocks"]`` and
    aggregates the returned classifications into per-type counts and a
    per-page listing.

    Args:
        hierarchical_structure: Mapping with a ``"blocks"`` entry.

    Returns:
        ``{"error": "No hierarchical structure provided"}`` when the argument
        is falsy. Otherwise a tuple ``(final_output, results)`` where
        ``final_output`` has the keys ``"analysis_summary"`` and
        ``"changes_by_page"`` and ``results`` is the raw list returned by
        ``traverse_blocks``.

    NOTE(review): the error case returns a single dict while the success case
    returns a 2-tuple; callers must handle both shapes.
    """
    if not hierarchical_structure:
        return {"error": "No hierarchical structure provided"}

    changes_by_type = {"addition": 0, "deletion": 0, "modification": 0}
    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": changes_by_type,
    }
    changes_by_page = {}

    results = traverse_blocks(hierarchical_structure["blocks"])

    for change in results:
        # The classifications come from model output; a positive result may
        # lack the key entirely, which is treated as "no classifications".
        classifications = change.get("classifications") or []
        analysis_summary["total_changes_detected"] += len(classifications)

        for classification in classifications:
            # Count unexpected change types under their own key instead of
            # raising KeyError and losing the whole summary.
            change_type = classification["change_type"]
            changes_by_type[change_type] = changes_by_type.get(change_type, 0) + 1

            page_number = change["location"]["page_number"]
            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            changes_by_page.setdefault(page_number, []).append(
                {
                    "change": classification["change"],
                    "change_type": change_type,
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "text": change["text"],
                    "explanation": classification["explanation"],
                }
            )

    final_output = {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }

    return final_output, results
|
|
|