import json
import os

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import render_prompt
from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text

# Load environment variables from .env file (expects OPENAI_API_KEY).
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)


def create_prompt_without_nlp_insights(text):
    """Render the change-classification prompt for *text* without NLP insights."""
    return render_prompt(text, include_nlp=False)


def classify_changes_without_nlp_insights(text_content, location_info):
    """Classify regulatory changes in a text chunk via the OpenAI API.

    Args:
        text_content: Hierarchical text extracted from a leaf block.
        location_info: Dict describing where the text came from
            (page number, block text).

    Returns:
        The parsed JSON result dict — augmented with ``location`` and
        ``source_text`` — when the model reports ``changes_detected``;
        ``None`` when no changes were detected or the response was not
        valid JSON.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
            },
            {
                "role": "user",
                "content": create_prompt_without_nlp_insights(text_content),
            },
        ],
        temperature=0.7,
        max_tokens=1024,
    )
    try:
        result = json.loads(response.choices[0].message.content)
    except json.JSONDecodeError:
        # Model ignored the JSON-only instruction; treat as "no changes".
        return None
    if result.get("changes_detected", False):
        result["location"] = location_info
        result["source_text"] = text_content
        return result
    return None


def traverse_blocks(
    blocks, parent=None, grandparent=None, results=None, is_top_level=True
):
    """Depth-first traversal of the hierarchical structure, analyzing leaf nodes.

    Args:
        blocks: List of block dicts; each block may carry a ``children`` list.
        parent: The block that contains *blocks* (attached to each block for
            context tracking).
        grandparent: Kept for backward compatibility with existing callers;
            not read by this function.
        results: Accumulator for detected changes; created on the first call.
        is_top_level: When True, wrap iteration in a tqdm progress bar.

    Returns:
        List of change dicts produced by
        ``classify_changes_without_nlp_insights`` for every leaf block.
    """
    if results is None:
        results = []

    iterable = (
        tqdm(blocks, desc="Processing Text blocks with NLP") if is_top_level else blocks
    )

    for block in iterable:
        # Record the enclosing block so downstream consumers can walk upward.
        block["parent"] = parent

        children = block.get("children")
        if not children:
            # Leaf node. (Fix: a block with no "children" key at all used to
            # fall into the recursion branch and raise KeyError; it is now
            # treated as a leaf like a block with an empty children list.)
            text_content = extract_hierarchical_text(block)
            location_info = {
                "page_number": block["page_number"],
                "block_text": block["text"],
            }
            changes = classify_changes_without_nlp_insights(text_content, location_info)
            if changes:
                # Keep the full hierarchical text alongside the classification.
                changes["text"] = text_content
                results.append(changes)
        else:
            # Recurse: this block becomes the parent, its parent the grandparent.
            traverse_blocks(children, block, parent, results, is_top_level=False)

    return results


def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
    """Detect regulatory changes across a PyMuPDF hierarchical structure.

    Args:
        hierarchical_structure: Dict with a ``"blocks"`` list as produced by
            the preprocessing step.

    Returns:
        A ``(final_output, results)`` tuple where ``final_output`` contains an
        ``analysis_summary`` plus ``changes_by_page``, and ``results`` is the
        raw per-leaf change list. Returns an error dict when no structure is
        provided.
    """
    if not hierarchical_structure:
        return {"error": "No hierarchical structure provided"}

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"addition": 0, "deletion": 0, "modification": 0},
    }
    changes_by_page = {}

    # Traverse the blocks and analyze leaf nodes.
    results = traverse_blocks(hierarchical_structure["blocks"])

    for change in results:
        analysis_summary["total_changes_detected"] += len(change["classifications"])
        page_number = change["location"]["page_number"]

        for classification in change["classifications"]:
            change_type = classification["change_type"]
            # Robustness fix: tolerate change types outside the three expected
            # keys instead of raising KeyError on unexpected model output.
            analysis_summary["changes_by_type"][change_type] = (
                analysis_summary["changes_by_type"].get(change_type, 0) + 1
            )

            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            changes_by_page.setdefault(page_number, []).append(
                {
                    "change": classification["change"],
                    "change_type": classification["change_type"],
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "text": change["text"],
                    "explanation": classification["explanation"],
                }
            )

    final_output = {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }
    return final_output, results