File size: 4,816 Bytes
ce77033
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import render_prompt
from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text


# Load environment variables from .env file
load_dotenv()

# Previously used spaCy German model; kept for reference while NLP insights
# are disabled in this variant of the pipeline.
#nlp = spacy.load("de_core_news_sm")
# NOTE(review): os.getenv returns None when OPENAI_API_KEY is unset; the
# OpenAI client is still constructed and will only fail at request time.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level client shared by all classification calls below.
openai_client = OpenAI(api_key=api_key)


def create_prompt_without_nlp_insights(text):
    """Build the change-classification prompt for *text*.

    Thin wrapper around :func:`render_prompt` with NLP-derived insights
    disabled, so the model sees only the raw chunk text.
    """
    prompt = render_prompt(text, include_nlp=False)
    return prompt


def classify_changes_without_nlp_insights(text_content, location_info):
    """Classify changes in text chunks using OpenAI.

    Parameters
    ----------
    text_content : str
        Hierarchical text of a leaf block to analyze.
    location_info : dict
        Positional metadata (page number, block text) attached to any hit.

    Returns
    -------
    dict | None
        Parsed model output enriched with ``location`` and ``source_text``
        when ``changes_detected`` is truthy; ``None`` on no changes, an
        empty response, or unparseable JSON.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
            },
            {
                "role": "user",
                "content": create_prompt_without_nlp_insights(text_content),
            },
        ],
        temperature=0.7,
        max_tokens=1024,
    )

    content = response.choices[0].message.content
    if not content:
        # The API can return a None/empty message (e.g. content filter);
        # json.loads(None) would raise TypeError, which was not caught before.
        return None

    # Models occasionally wrap the JSON in a ```json ... ``` code fence
    # despite the system instruction; strip it before parsing.
    content = content.strip()
    if content.startswith("```"):
        content = content.strip("`").strip()
        if content.startswith("json"):
            content = content[len("json"):].strip()

    try:
        result = json.loads(content)
    except json.JSONDecodeError:
        return None

    if result.get("changes_detected", False):
        result["location"] = location_info
        result["source_text"] = text_content
        return result
    return None


def traverse_blocks(
    blocks, parent=None, grandparent=None, results=None, is_top_level=True
):
    """Traverse the hierarchical structure depth-first and analyze leaf nodes.

    Parameters
    ----------
    blocks : list[dict]
        Blocks at the current level; each may carry a ``children`` list.
    parent, grandparent : dict | None
        Ancestor blocks for context tracking. NOTE(review): ``grandparent``
        is accepted and threaded through the recursion but never stored on
        the block — kept for interface compatibility; confirm whether it
        should be recorded like ``parent``.
    results : list | None
        Accumulator for detected changes; created on the top-level call.
    is_top_level : bool
        Wraps the top-level iteration in a tqdm progress bar.

    Returns
    -------
    list[dict]
        All change records collected from leaf blocks.
    """
    if results is None:
        results = []
    iterable = (
        tqdm(blocks, desc="Processing Text blocks with NLP") if is_top_level else blocks
    )

    for block in iterable:
        # Add parent reference to the block for context tracking
        block["parent"] = parent

        # Leaf node: no "children" key at all, or an empty children list.
        # BUGFIX: the original only treated a *present but empty* children
        # list as a leaf, so a block missing the "children" key fell into
        # the else-branch and raised KeyError on block["children"].
        if not block.get("children"):
            # Extract hierarchical text
            text_content = extract_hierarchical_text(block)

            # Define location info
            location_info = {
                "page_number": block["page_number"],
                "block_text": block["text"],
            }

            # Analyze the text for changes
            changes = classify_changes_without_nlp_insights(text_content, location_info)
            if changes:
                # Add the full hierarchical text to the result
                changes["text"] = text_content
                results.append(changes)
        else:
            # Recurse: current block becomes the parent, previous parent
            # becomes the grandparent.
            traverse_blocks(
                block["children"], block, parent, results, is_top_level=False
            )

    return results


def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
    """Detect regulatory changes in the hierarchical structure.

    Parameters
    ----------
    hierarchical_structure : dict
        Must contain a ``"blocks"`` list as produced by the PDF
        preprocessing step.

    Returns
    -------
    tuple[dict, list]
        ``(final_output, results)`` where ``final_output`` has an
        ``analysis_summary`` and page-grouped changes, and ``results`` is
        the raw per-block change list.
    """
    if not hierarchical_structure:
        # BUGFIX: keep the 2-tuple shape on the error path. The original
        # returned a bare dict here, so callers doing
        # `output, results = detector(...)` crashed on empty input.
        return {"error": "No hierarchical structure provided"}, []

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"addition": 0, "deletion": 0, "modification": 0},
    }
    changes_by_page = {}

    # Traverse the blocks and analyze leaf nodes
    results = traverse_blocks(hierarchical_structure["blocks"])

    # Update analysis summary
    for change in results:
        analysis_summary["total_changes_detected"] += len(change["classifications"])

        for classification in change["classifications"]:
            change_type = classification["change_type"]
            # BUGFIX: tolerate change types outside the three pre-seeded
            # keys; the original `+= 1` raised KeyError on anything else
            # the model emitted.
            by_type = analysis_summary["changes_by_type"]
            by_type[change_type] = by_type.get(change_type, 0) + 1

            # Group changes by page number
            page_number = change["location"]["page_number"]
            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            changes_by_page.setdefault(page_number, []).append(
                {
                    "change": classification["change"],
                    "change_type": classification["change_type"],
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "text": change["text"],
                    "explanation": classification["explanation"],
                }
            )

    # Combine analysis summary and grouped changes
    final_output = {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }

    return final_output, results