|
|
import json |
|
|
import os |
|
|
from dotenv import load_dotenv |
|
|
from openai import OpenAI |
|
|
from tqdm import tqdm |
|
|
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES |
|
|
from scripts.utility_functions import call_nlp_service, render_prompt |
|
|
|
|
|
|
|
|
|
|
|
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()


# API key is read from the environment; if OPENAI_API_KEY is unset this is None
# and requests will fail at call time, not at import time.
api_key = os.getenv("OPENAI_API_KEY")

# Shared client used by all chat-completion calls in this module.
openai_client = OpenAI(api_key=api_key)
|
|
|
|
|
|
|
|
def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Send *text* to the external NLP service and unpack its reply.

    Args:
        text (str): Raw text to chunk and preprocess.
        max_chunk_size (int): Accepted for interface compatibility.
            NOTE(review): not forwarded to the service — confirm whether the
            service applies its own chunk size.
        overlap (int): Accepted for interface compatibility; also not
            forwarded (see above).

    Returns:
        tuple: ``(chunks, preprocessed_data)`` taken from the service response.
    """
    payload = {"text": text}
    service_reply = call_nlp_service(payload, "preprocess_text_with_nlp_llm")
    return service_reply["chunks"], service_reply["preprocessed_data"]
|
|
|
|
|
|
|
|
def create_prompt(chunk, preprocessed_data):
    """Render the LLM user prompt for *chunk*, enriched with NLP output."""
    prompt_text = render_prompt(
        chunk,
        include_nlp=True,
        preprocessed_data=preprocessed_data,
    )
    return prompt_text
|
|
|
|
|
|
|
|
def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
    """Run each text chunk through the LLM and collect detected changes.

    Args:
        chunks (list[str]): Text chunks to analyze.
        preprocessed_data: NLP preprocessing output used to enrich each prompt.
        subtitle (str): Section subtitle recorded in each result's location.

    Returns:
        list[dict]: One parsed result per chunk that reported changes
        (``changes_detected`` truthy), each augmented with ``"location"``
        and ``"source_text"``. Chunks whose model output is not valid JSON
        are skipped silently (best-effort behavior preserved).
    """
    # Loop-invariant system instruction, hoisted out of the per-chunk loop.
    # Mentioning "JSON" here also satisfies the API requirement for using
    # response_format={"type": "json_object"} below.
    system_message = {
        "role": "system",
        "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
    }

    results = []
    for chunk in chunks:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                system_message,
                {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
            ],
            temperature=0.7,
            max_tokens=1024,
            # Constrain the model to emit well-formed JSON so the parse
            # below rarely fails (supported by gpt-4o-mini).
            response_format={"type": "json_object"},
        )

        # Keep the try body minimal: only json.loads can raise JSONDecodeError.
        try:
            result = json.loads(response.choices[0].message.content)
        except json.JSONDecodeError:
            continue

        if result.get("changes_detected", False):
            result["location"] = {"subtitle": subtitle}
            result["source_text"] = chunk
            results.append(result)

    return results
|
|
|
|
|
|
|
|
def detect_regulatory_changes(text_content, subtitle):
    """Detect regulatory changes in raw text content.

    Preprocesses the text through the NLP service, then scans every chunk
    with the LLM.

    Args:
        text_content (str): The raw text content to analyze.
        subtitle (str): The subtitle associated with the content.

    Returns:
        list[dict]: Detected-change results, one entry per chunk that
        reported changes (see ``search_for_regulatory_changes``).
    """
    chunks, preprocessed_data = preprocess_text_with_nlp(text_content)
    return search_for_regulatory_changes(chunks, preprocessed_data, subtitle)
|
|
|
|
|
|
|
|
def llm_regulatory_change_detector(hierarchical_structure):
    """Analyze a parsed document structure section by section for changes.

    Args:
        hierarchical_structure (dict): Expected to contain a ``"sections"``
            list; each section is a dict with ``"subtitle"`` and
            ``"content"`` (a string or a list of strings).

    Returns:
        dict | None: ``{"analysis_summary": ..., "results": {subtitle: [...]}}``,
        or ``None`` when *hierarchical_structure* is falsy (the empty-input
        case returned None before; this makes it explicit instead of
        relying on fall-through).
    """
    if not hierarchical_structure:
        return None

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"additions": 0, "deletions": 0, "modifications": 0},
    }
    subtitles = {}

    for section in tqdm(
        hierarchical_structure["sections"], desc="Analyzing Sections"
    ):
        subtitle = section["subtitle"]
        content = section["content"]
        # Content may arrive as a list of lines; normalize to one string.
        if isinstance(content, list):
            content = "\n".join(content)

        changes = detect_regulatory_changes(content, subtitle)

        # Single pass over the classifications: tally summary counts and
        # build the per-subtitle entries (the original iterated twice).
        subtitles[subtitle] = []
        for change in changes:
            classifications = change["classifications"]
            analysis_summary["total_changes_detected"] += len(classifications)
            for classification in classifications:
                change_type = classification["change_type"]
                # Pluralize to match the changes_by_type keys
                # ("addition" -> "additions", etc.).
                analysis_summary["changes_by_type"][f"{change_type}s"] += 1
                # CONTEXT_CATEGORIES membership decides the subtype bucket.
                change_subtype = (
                    "context"
                    if classification["change"] in CONTEXT_CATEGORIES
                    else "scope"
                )
                subtitles[subtitle].append(
                    {
                        "change": classification["change"],
                        "change_type": classification["change_type"],
                        "change_subtype": change_subtype,
                        "relevant_text": classification["relevant_text"],
                        "explanation": classification["explanation"],
                        "nlp_evidence": classification["evidence"],
                    }
                )

    return {"analysis_summary": analysis_summary, "results": subtitles}
|
|
|